diff --git a/.gitignore b/.gitignore index 0b68375a..c78e6696 100644 --- a/.gitignore +++ b/.gitignore @@ -81,3 +81,7 @@ ipython_config.py venv/ env.bak/ venv.bak/ + +# Smooth exploration +gym/smooth_exploration/data* +gym/smooth_exploration/figures* diff --git a/gym/envs/__init__.py b/gym/envs/__init__.py index 80ede800..826b25e6 100644 --- a/gym/envs/__init__.py +++ b/gym/envs/__init__.py @@ -19,6 +19,8 @@ "Anymal": ".anymal_c.anymal", "A1": ".a1.a1", "HumanoidRunning": ".mit_humanoid.humanoid_running", + "Pendulum": ".pendulum.pendulum", + "Lander": ".mit_humanoid.lander", } config_dict = { @@ -26,10 +28,15 @@ "MiniCheetahCfg": ".mini_cheetah.mini_cheetah_config", "MiniCheetahRefCfg": ".mini_cheetah.mini_cheetah_ref_config", "MiniCheetahOscCfg": ".mini_cheetah.mini_cheetah_osc_config", + "MiniCheetahSACCfg": ".mini_cheetah.mini_cheetah_SAC_config", "MITHumanoidCfg": ".mit_humanoid.mit_humanoid_config", "A1Cfg": ".a1.a1_config", "AnymalCFlatCfg": ".anymal_c.flat.anymal_c_flat_config", "HumanoidRunningCfg": ".mit_humanoid.humanoid_running_config", + "PendulumCfg": ".pendulum.pendulum_config", + "PendulumSACCfg": ".pendulum.pendulum_SAC_config", + "LanderCfg": ".mit_humanoid.lander_config", + "PendulumPSDCfg": ".pendulum.pendulum_PSD_config", } runner_config_dict = { @@ -37,10 +44,15 @@ "MiniCheetahRunnerCfg": ".mini_cheetah.mini_cheetah_config", "MiniCheetahRefRunnerCfg": ".mini_cheetah.mini_cheetah_ref_config", "MiniCheetahOscRunnerCfg": ".mini_cheetah.mini_cheetah_osc_config", + "MiniCheetahSACRunnerCfg": ".mini_cheetah.mini_cheetah_SAC_config", "MITHumanoidRunnerCfg": ".mit_humanoid.mit_humanoid_config", "A1RunnerCfg": ".a1.a1_config", "AnymalCFlatRunnerCfg": ".anymal_c.flat.anymal_c_flat_config", "HumanoidRunningRunnerCfg": ".mit_humanoid.humanoid_running_config", + "PendulumRunnerCfg": ".pendulum.pendulum_config", + "PendulumSACRunnerCfg": ".pendulum.pendulum_SAC_config", + "LanderRunnerCfg": ".mit_humanoid.lander_config", + "PendulumPSDRunnerCfg": ".pendulum.pendulum_PSD_config", } task_dict = { @@ -56,14 +68,22 @@ "MiniCheetahOscCfg", "MiniCheetahOscRunnerCfg", ], + "sac_mini_cheetah": [ + "MiniCheetahRef", + "MiniCheetahSACCfg", + "MiniCheetahSACRunnerCfg" + ], "humanoid": ["MIT_Humanoid", "MITHumanoidCfg", "MITHumanoidRunnerCfg"], "humanoid_running": [ "HumanoidRunning", "HumanoidRunningCfg", "HumanoidRunningRunnerCfg", ], - "a1": ["A1", "A1Cfg", "A1RunnerCfg"], "flat_anymal_c": ["Anymal", "AnymalCFlatCfg", "AnymalCFlatRunnerCfg"], + "pendulum": ["Pendulum", "PendulumCfg", "PendulumRunnerCfg"], + "sac_pendulum": ["Pendulum", "PendulumSACCfg", "PendulumSACRunnerCfg"], + "lander": ["Lander", "LanderCfg", "LanderRunnerCfg"], + "psd_pendulum": ["Pendulum", "PendulumPSDCfg", "PendulumPSDRunnerCfg"], } for class_name, class_location in class_dict.items(): diff --git a/gym/envs/a1/a1_config.py b/gym/envs/a1/a1_config.py index cc19d8f3..87477b56 100644 --- a/gym/envs/a1/a1_config.py +++ b/gym/envs/a1/a1_config.py @@ -137,13 +137,13 @@ class scaling(LeggedRobotCfg.scaling): class A1RunnerCfg(LeggedRobotRunnerCfg): seed = -1 + runner_class_name = "OldPolicyRunner" - class policy(LeggedRobotRunnerCfg.policy): - actor_hidden_dims = [256, 256, 256] - critic_hidden_dims = [256, 256, 256] + class actor(LeggedRobotRunnerCfg.actor): + hidden_dims = [256, 256, 256] # activation can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - actor_obs = [ + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -154,7 +154,19 @@ class policy(LeggedRobotRunnerCfg.policy): "commands", ] - 
critic_obs = [ + actions = ["dof_pos_target"] + + class noise: + dof_pos_obs = 0.005 + dof_vel = 0.005 + base_ang_vel = 0.05 + projected_gravity = 0.02 + + class critic: + hidden_dims = [256, 256, 256] + # activation can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -165,16 +177,8 @@ class policy(LeggedRobotRunnerCfg.policy): "commands", ] - actions = ["dof_pos_target"] - - class noise: - dof_pos_obs = 0.005 # can be made very low - dof_vel = 0.005 - base_ang_vel = 0.05 - projected_gravity = 0.02 - - class reward(LeggedRobotRunnerCfg.policy.reward): - class weights(LeggedRobotRunnerCfg.policy.reward.weights): + class reward: + class weights: tracking_lin_vel = 1.0 tracking_ang_vel = 1.0 lin_vel_z = 0.0 diff --git a/gym/envs/anymal_c/flat/anymal_c_flat_config.py b/gym/envs/anymal_c/flat/anymal_c_flat_config.py index 4ce7f214..d18f7cca 100644 --- a/gym/envs/anymal_c/flat/anymal_c_flat_config.py +++ b/gym/envs/anymal_c/flat/anymal_c_flat_config.py @@ -85,6 +85,7 @@ class push_robots: toggle = True interval_s = 1 max_push_vel_xy = 0.5 + push_box_dims = [0.2, 0.2, 0.2] class domain_rand(LeggedRobotCfg.domain_rand): randomize_base_mass = True @@ -120,14 +121,14 @@ class scaling(LeggedRobotCfg.scaling): class AnymalCFlatRunnerCfg(LeggedRobotRunnerCfg): seed = -1 + runner_class_name = "OldPolicyRunner" - class policy(LeggedRobotRunnerCfg.policy): - actor_hidden_dims = [256, 256, 256] - critic_hidden_dims = [256, 256, 256] + class actor(LeggedRobotRunnerCfg.actor): + hidden_dims = [256, 256, 256] # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - actor_obs = [ + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -138,7 +139,19 @@ class policy(LeggedRobotRunnerCfg.policy): "dof_pos_history", ] - critic_obs = [ + actions = ["dof_pos_target"] + + class noise: + dof_pos_obs = 0.005 + dof_vel = 0.005 + base_ang_vel = 0.05 # 0.027, 0.14, 0.37 + projected_gravity = 0.02 + + class critic(LeggedRobotRunnerCfg.critic): + hidden_dims = [256, 256, 256] + # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -149,16 +162,8 @@ class policy(LeggedRobotRunnerCfg.policy): "dof_pos_history", ] - actions = ["dof_pos_target"] - - class noise: - dof_pos_obs = 0.005 # can be made very low - dof_vel = 0.005 - base_ang_vel = 0.05 # 0.027, 0.14, 0.37 - projected_gravity = 0.02 - - class reward(LeggedRobotRunnerCfg.policy.reward): - class weights(LeggedRobotRunnerCfg.policy.reward.weights): + class reward: + class weights: tracking_lin_vel = 3.0 tracking_ang_vel = 1.0 lin_vel_z = 0.0 diff --git a/gym/envs/anymal_c/mixed_terrains/anymal_c_rough_config.py b/gym/envs/anymal_c/mixed_terrains/anymal_c_rough_config.py index badc3faa..85b0c42b 100644 --- a/gym/envs/anymal_c/mixed_terrains/anymal_c_rough_config.py +++ b/gym/envs/anymal_c/mixed_terrains/anymal_c_rough_config.py @@ -56,12 +56,15 @@ class domain_rand(AnymalCFlatCfg.domain_rand): class AnymalCRoughCCfgPPO(AnymalCFlatCfgPPO): - class policy(AnymalCFlatCfgPPO.policy): - actor_hidden_dims = [128, 64, 32] + class actor(AnymalCFlatCfgPPO.actor): + hidden_dims = [128, 64, 32] critic_hidden_dims = [128, 64, 32] # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + class critic(AnymalCFlatCfgPPO.critic): + pass + class algorithm(AnymalCFlatCfgPPO.algorithm): entropy_coef = 0.01 diff --git a/gym/envs/base/base_task.py b/gym/envs/base/base_task.py index 
551143a3..033e0eeb 100644 --- a/gym/envs/base/base_task.py +++ b/gym/envs/base/base_task.py @@ -18,33 +18,20 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): # * env device is GPU only if sim is on GPU and use_gpu_pipeline=True, # * otherwise returned tensors are copied to CPU by physX. if sim_device_type == "cuda" and sim_params.use_gpu_pipeline: - self.device = self.sim_device + device = self.sim_device else: - self.device = "cpu" + device = "cpu" # * graphics device for rendering, -1 for no rendering self.graphics_device_id = self.sim_device_id - self.num_envs = cfg.env.num_envs self.num_actuators = cfg.env.num_actuators # * optimization flags for pytorch JIT torch._C._jit_set_profiling_mode(False) torch._C._jit_set_profiling_executor(False) - # allocate buffers - self.to_be_reset = torch.ones( - self.num_envs, device=self.device, dtype=torch.bool - ) - self.terminated = torch.ones( - self.num_envs, device=self.device, dtype=torch.bool - ) - self.episode_length_buf = torch.zeros( - self.num_envs, device=self.device, dtype=torch.long - ) - self.timed_out = torch.zeros( - self.num_envs, device=self.device, dtype=torch.bool - ) + super().__init__(num_envs=cfg.env.num_envs, device=device) # todo: read from config self.enable_viewer_sync = True diff --git a/gym/envs/base/fixed_robot.py b/gym/envs/base/fixed_robot.py index 99c038d9..f623e5d1 100644 --- a/gym/envs/base/fixed_robot.py +++ b/gym/envs/base/fixed_robot.py @@ -40,64 +40,60 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): self.reset() def step(self): - """Apply actions, simulate, call self.post_physics_step() - and pre_physics_step() - - Args: - actions (torch.Tensor): Tensor of shape - (num_envs, num_actions_per_env) - """ - self._reset_buffers() - self._pre_physics_step() - # * step physics and render each frame + self._pre_decimation_step() self._render() for _ in range(self.cfg.control.decimation): + self._pre_compute_torques() self.torques = self._compute_torques() - - if self.cfg.asset.disable_motors: - self.torques[:] = 0.0 - torques_to_gym_tensor = torch.zeros( - self.num_envs, self.num_dof, device=self.device - ) - - # todo encapsulate - next_torques_idx = 0 - for dof_idx in range(self.num_dof): - if self.cfg.control.actuated_joints_mask[dof_idx]: - torques_to_gym_tensor[:, dof_idx] = self.torques[ - :, next_torques_idx - ] - next_torques_idx += 1 - else: - torques_to_gym_tensor[:, dof_idx] = torch.zeros( - self.num_envs, device=self.device - ) - - self.gym.set_dof_actuation_force_tensor( - self.sim, gymtorch.unwrap_tensor(torques_to_gym_tensor) - ) - self.gym.simulate(self.sim) - if self.device == "cpu": - self.gym.fetch_results(self.sim, True) - self.gym.refresh_dof_state_tensor(self.sim) - - self._post_physics_step() + self._post_compute_torques() + self._step_physx_sim() + self._post_physx_step() + self._post_decimation_step() self._check_terminations_and_timeouts() env_ids = self.to_be_reset.nonzero(as_tuple=False).flatten() self._reset_idx(env_ids) - def _pre_physics_step(self): - pass + def _pre_decimation_step(self): + return None + + def _pre_compute_torques(self): + return None + + def _post_compute_torques(self): + if self.cfg.asset.disable_motors: + self.torques[:] = 0.0 - def _post_physics_step(self): + def _step_physx_sim(self): + next_torques_idx = 0 + torques_to_gym_tensor = torch.zeros( + self.num_envs, self.num_dof, device=self.device + ) + for dof_idx in range(self.num_dof): + if self.cfg.control.actuated_joints_mask[dof_idx]: + torques_to_gym_tensor[:, 
dof_idx] = self.torques[:, next_torques_idx] + next_torques_idx += 1 + else: + torques_to_gym_tensor[:, dof_idx] = torch.zeros( + self.num_envs, device=self.device + ) + self.gym.set_dof_actuation_force_tensor( + self.sim, gymtorch.unwrap_tensor(torques_to_gym_tensor) + ) + self.gym.simulate(self.sim) + if self.device == "cpu": + self.gym.fetch_results(self.sim, True) + self.gym.refresh_dof_state_tensor(self.sim) + + def _post_physx_step(self): """ check terminations, compute observations and rewards """ self.gym.refresh_actor_root_state_tensor(self.sim) self.gym.refresh_net_contact_force_tensor(self.sim) + def _post_decimation_step(self): self.episode_length_buf += 1 self.common_step_counter += 1 @@ -188,10 +184,7 @@ def _process_dof_props(self, props, env_id): try: # ! this seems like a crime... self.torque_limits[i] = props["effort"][i].item() except: - print( - "WARNING: passive joints need to be listed after " - + "active joints in the URDF." - ) + print("WARNING: your system has unactuated joints") # * soft limits m = (self.dof_pos_limits[i, 0] + self.dof_pos_limits[i, 1]) / 2 r = self.dof_pos_limits[i, 1] - self.dof_pos_limits[i, 0] @@ -212,18 +205,6 @@ def _process_rigid_body_props(self, props, env_id): return props def _compute_torques(self): - """Compute torques from actions. - Actions can be interpreted as position or velocity targets given - to a PD controller, or directly as scaled torques. - [NOTE]: torques must have the same dimension as the number of DOFs, - even if some DOFs are not actuated. - - Args: - actions (torch.Tensor): Actions - - Returns: - [torch.Tensor]: Torques sent to the simulation - """ actuated_dof_pos = torch.zeros( self.num_envs, self.num_actuators, device=self.device ) @@ -415,10 +396,7 @@ def _init_buffers(self): self.default_act_pos = self.default_act_pos.unsqueeze(0) # * store indices of actuated joints self.act_idx = to_torch(actuated_idx, dtype=torch.long, device=self.device) - # * check that init range highs and lows are consistent - # * and repopulate to match - if self.cfg.init_state.reset_mode == "reset_to_range": - self.initialize_ranges_for_initial_conditions() + self.initialize_ranges_for_initial_conditions() def initialize_ranges_for_initial_conditions(self): self.dof_pos_range = torch.zeros( @@ -584,50 +562,42 @@ def _get_env_origins(self): # TODO: do without terrain # ------------ reward functions---------------- - def _sqrdexp(self, x, scale=1.0): + def _sqrdexp(self, x, sigma=None): """shorthand helper for squared exponential""" - return torch.exp( - -torch.square(x / scale) / self.cfg.reward_settings.tracking_sigma - ) + if sigma is None: + return torch.exp(-torch.square(x) / self.cfg.reward_settings.tracking_sigma) + else: + return torch.exp(-torch.square(x) / sigma) def _reward_torques(self): """Penalize torques""" - return -torch.sum(torch.square(self.torques), dim=1) + return -torch.mean(torch.square(self.torques), dim=1) def _reward_dof_vel(self): """Penalize dof velocities""" - return -torch.sum(torch.square(self.dof_vel), dim=1) + return -torch.mean(torch.square(self.dof_vel), dim=1) def _reward_action_rate(self): """Penalize changes in actions""" nact = self.num_actuators - dt2 = (self.dt * self.cfg.control.decimation) ** 2 - error = ( - torch.square( - self.dof_pos_history[:, :nact] - - self.dof_pos_history[:, nact : 2 * nact] - ) - / dt2 + error = torch.square( + self.dof_pos_history[:, :nact] - self.dof_pos_history[:, 2 * nact :] ) - return -torch.sum(error, dim=1) + return -torch.mean(error, dim=1) def 
_reward_action_rate2(self): """Penalize changes in actions""" nact = self.num_actuators - dt2 = (self.dt * self.cfg.control.decimation) ** 2 - error = ( - torch.square( - self.dof_pos_history[:, :nact] - - 2 * self.dof_pos_history[:, nact : 2 * nact] - + self.dof_pos_history[:, 2 * nact :] - ) - / dt2 + error = torch.square( + self.dof_pos_history[:, :nact] + - 2 * self.dof_pos_history[:, nact : 2 * nact] + + self.dof_pos_history[:, 2 * nact :] ) - return -torch.sum(error, dim=1) + return -torch.mean(error, dim=1) def _reward_collision(self): """Penalize collisions on selected bodies""" - return -torch.sum( + return -torch.mean( 1.0 * ( torch.norm( @@ -649,11 +619,11 @@ def _reward_dof_pos_limits(self): max=0.0 ) # lower limit out_of_limits += (self.dof_pos - self.dof_pos_limits[:, 1]).clip(min=0.0) - return -torch.sum(out_of_limits, dim=1) + return -torch.mean(out_of_limits, dim=1) def _reward_dof_vel_limits(self): """Penalize dof velocities too close to the limit""" # * clip to max error = 1 rad/s per joint to avoid huge penalties limit = self.cfg.reward_settings.soft_dof_vel_limit error = self.dof_vel.abs() - self.dof_vel_limits * limit - return -torch.sum(error.clip(min=0.0, max=1.0), dim=1) + return -torch.mean(error.clip(min=0.0, max=1.0), dim=1) diff --git a/gym/envs/base/fixed_robot_config.py b/gym/envs/base/fixed_robot_config.py index a59cc9eb..60160b9c 100644 --- a/gym/envs/base/fixed_robot_config.py +++ b/gym/envs/base/fixed_robot_config.py @@ -123,35 +123,34 @@ class FixedRobotCfgPPO(BaseConfig): class logging: enable_local_saving = True - class policy: + class actor: init_noise_std = 1.0 - actor_hidden_dims = [512, 256, 128] - critic_hidden_dims = [512, 256, 128] + hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - # only for 'ActorCriticRecurrent': - # rnn_type = 'lstm' - # rnn_hidden_size = 512 - # rnn_num_layers = 1 - - actor_obs = [ + obs = [ "observation_a", "observation_b", "these_need_to_be_atributes_(states)_of_the_robot_env", ] + normalize_obs = True + smooth_exploration = False + actions = ["tau_ff"] + disable_actions = False - critic_obs = [ + class noise: + observation_a = 0.1 # implement as needed, also in your robot class + + class critic: + hidden_dims = [512, 256, 128] + activation = "elu" + normalize_obs = True + obs = [ "observation_x", "observation_y", "critic_obs_can_be_the_same_or_different_than_actor_obs", ] - actions = ["tau_ff"] - disable_actions = False - - class noise: - noise = 0.1 # implement as needed, also in your robot class - class rewards: class weights: torques = 0.0 @@ -165,24 +164,29 @@ class termination_weight: termination = 0.0 class algorithm: - # * training params - value_loss_coef = 1.0 - use_clipped_value_loss = True + # both + gamma = 0.99 + lam = 0.95 + # shared + batch_size = 2**15 + max_gradient_steps = 24 + # new + storage_size = 2**17 # new + batch_size = 2**15 # new + clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 5 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor + entropy_coef = 0.01 schedule = "adaptive" # could be adaptive, fixed - gamma = 0.99 - lam = 0.95 desired_kl = 0.01 - max_grad_norm = 1.0 class runner: policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 24 # per iteration max_iterations = 500 # number of policy updates diff --git 
a/gym/envs/base/legged_robot.py b/gym/envs/base/legged_robot.py index e02507d5..c80ebc85 100644 --- a/gym/envs/base/legged_robot.py +++ b/gym/envs/base/legged_robot.py @@ -110,12 +110,12 @@ def _post_decimation_step(self): self.base_height = self.root_states[:, 2:3] - n = self.num_actuators - self.dof_pos_history[:, 2 * n :] = self.dof_pos_history[:, n : 2 * n] - self.dof_pos_history[:, n : 2 * n] = self.dof_pos_history[:, :n] - self.dof_pos_history[:, :n] = self.dof_pos_target self.dof_pos_obs = self.dof_pos - self.default_dof_pos + self.dof_pos_history = self.dof_pos_history.roll(self.num_actuators) + # self.dof_pos_history[:, : self.num_actuators] = self.dof_pos_obs + self.dof_pos_history[:, : self.num_actuators] = self.dof_pos_target + env_ids = ( self.episode_length_buf % int(self.cfg.commands.resampling_time / self.dt) == 0 @@ -133,7 +133,10 @@ def _reset_idx(self, env_ids): self._reset_system(env_ids) self._resample_commands(env_ids) # * reset buffers - self.dof_pos_history[env_ids] = 0.0 + self.dof_pos_obs[env_ids] = self.dof_pos[env_ids] - self.default_dof_pos + # self.dof_pos_history[env_ids] = self.dof_pos_obs[env_ids].tile(3) + self.dof_pos_target[env_ids] = self.default_dof_pos + self.dof_pos_history[env_ids] = self.dof_pos_target[env_ids].tile(3) self.episode_length_buf[env_ids] = 0 def _initialize_sim(self): @@ -481,41 +484,26 @@ def _init_buffers(self): get_axis_params(-1.0, self.up_axis_idx), device=self.device ).repeat((self.num_envs, 1)) self.torques = torch.zeros( - self.num_envs, - self.num_actuators, - dtype=torch.float, - device=self.device, + self.num_envs, self.num_actuators, dtype=torch.float, device=self.device ) self.p_gains = torch.zeros( - self.num_actuators, dtype=torch.float, device=self.device + self.num_envs, self.num_actuators, dtype=torch.float, device=self.device ) self.d_gains = torch.zeros( - self.num_actuators, dtype=torch.float, device=self.device + self.num_envs, self.num_actuators, dtype=torch.float, device=self.device ) self.dof_pos_target = torch.zeros( - self.num_envs, - self.num_actuators, - dtype=torch.float, - device=self.device, + self.num_envs, self.num_actuators, dtype=torch.float, device=self.device ) self.dof_vel_target = torch.zeros( - self.num_envs, - self.num_actuators, - dtype=torch.float, - device=self.device, + self.num_envs, self.num_actuators, dtype=torch.float, device=self.device ) self.tau_ff = torch.zeros( - self.num_envs, - self.num_actuators, - dtype=torch.float, - device=self.device, + self.num_envs, self.num_actuators, dtype=torch.float, device=self.device ) self.dof_pos_history = torch.zeros( - self.num_envs, - self.num_actuators * 3, - dtype=torch.float, - device=self.device, + self.num_envs, self.num_actuators * 3, dtype=torch.float, device=self.device ) self.commands = torch.zeros( self.num_envs, 3, dtype=torch.float, device=self.device @@ -532,6 +520,29 @@ def _init_buffers(self): self.num_envs, 1, dtype=torch.float, device=self.device ) + # # * get the body_name to body_index dict + # body_dict = self.gym.get_actor_rigid_body_dict( + # self.envs[0], self.actor_handles[0] + # ) + # # * extract a list of body_names where the index is the id number + # body_names = [ + # body_tuple[0] + # for body_tuple in sorted( + # body_dict.items(), key=lambda body_tuple: body_tuple[1] + # ) + # ] + # # * construct a list of id numbers corresponding to end_effectors + # self.end_effector_ids = [] + # for end_effector_name in self.cfg.asset.foot_collisionbox_names: + # self.end_effector_ids.extend( + # [ + # 
body_names.index(body_name) + # for body_name in body_names + # if end_effector_name in body_name + # ] + # ) + # # ---------------------------------------- + if self.cfg.terrain.measure_heights: self.height_points = self._init_height_points() self.measured_heights = 0 @@ -559,8 +570,8 @@ def _init_buffers(self): found = False for dof_name in self.cfg.control.stiffness.keys(): if dof_name in name: - self.p_gains[i] = self.cfg.control.stiffness[dof_name] - self.d_gains[i] = self.cfg.control.damping[dof_name] + self.p_gains[:, i] = self.cfg.control.stiffness[dof_name] + self.d_gains[:, i] = self.cfg.control.damping[dof_name] found = True if not found: self.p_gains[i] = 0.0 @@ -980,11 +991,12 @@ def _get_heights(self, env_ids=None): return heights.view(self.num_envs, -1) * self.terrain.cfg.vertical_scale - def _sqrdexp(self, x, scale=1.0): + def _sqrdexp(self, x, sigma=None): """shorthand helper for squared exponential""" - return torch.exp( - -torch.square(x / scale) / self.cfg.reward_settings.tracking_sigma - ) + if sigma is None: + return torch.exp(-torch.square(x) / self.cfg.reward_settings.tracking_sigma) + else: + return torch.exp(-torch.square(x) / sigma) # ------------ reward functions---------------- @@ -994,11 +1006,11 @@ def _reward_lin_vel_z(self): def _reward_ang_vel_xy(self): """Penalize xy axes base angular velocity""" - return -torch.sum(torch.square(self.base_ang_vel[:, :2]), dim=1) + return -torch.mean(torch.square(self.base_ang_vel[:, :2]), dim=1) def _reward_orientation(self): """Penalize non flat base orientation""" - return -torch.sum(torch.square(self.projected_gravity[:, :2]), dim=1) + return -torch.mean(torch.square(self.projected_gravity[:, :2]), dim=1) def _reward_base_height(self): """Penalize base height away from target""" @@ -1009,37 +1021,29 @@ def _reward_base_height(self): def _reward_torques(self): """Penalize torques""" - return -torch.sum(torch.square(self.torques), dim=1) + return -torch.mean(torch.square(self.torques), dim=1) def _reward_dof_vel(self): """Penalize dof velocities""" - return -torch.sum(torch.square(self.dof_vel), dim=1) + return -torch.mean(torch.square(self.dof_vel), dim=1) def _reward_action_rate(self): """Penalize changes in actions""" n = self.num_actuators - dt2 = (self.dt * self.cfg.control.decimation) ** 2 - error = ( - torch.square( - self.dof_pos_history[:, :n] - self.dof_pos_history[:, n : 2 * n] - ) - / dt2 + error = torch.square( + self.dof_pos_history[:, :n] - self.dof_pos_history[:, 2 * n :] ) - return -torch.sum(error, dim=1) + return -torch.mean(error, dim=1) def _reward_action_rate2(self): """Penalize changes in actions""" n = self.num_actuators - dt2 = (self.dt * self.cfg.control.decimation) ** 2 - error = ( - torch.square( - self.dof_pos_history[:, :n] - - 2 * self.dof_pos_history[:, n : 2 * n] - + self.dof_pos_history[:, 2 * n :] - ) - / dt2 + error = torch.square( + self.dof_pos_history[:, :n] + - 2 * self.dof_pos_history[:, n : 2 * n] + + self.dof_pos_history[:, 2 * n :] ) - return -torch.sum(error, dim=1) + return -torch.mean(error, dim=1) def _reward_collision(self): """Penalize collisions on selected bodies""" @@ -1064,26 +1068,26 @@ def _reward_dof_pos_limits(self): # * lower limit out_of_limits = -(self.dof_pos - self.dof_pos_limits[:, 0]).clip(max=0.0) out_of_limits += (self.dof_pos - self.dof_pos_limits[:, 1]).clip(min=0.0) - return -torch.sum(out_of_limits, dim=1) + return -torch.mean(out_of_limits, dim=1) def _reward_dof_vel_limits(self): """Penalize dof velocities too close to the limit""" # * clip to max 
error = 1 rad/s per joint to avoid huge penalties limit = self.cfg.reward_settings.soft_dof_vel_limit error = self.dof_vel.abs() - self.dof_vel_limits * limit - return -torch.sum(error.clip(min=0.0, max=1.0), dim=1) + return -torch.mean(error.clip(min=0.0, max=1.0), dim=1) def _reward_torque_limits(self): """penalize torques too close to the limit""" limit = self.cfg.reward_settings.soft_torque_limit error = self.torques.abs() - self.torque_limits * limit - return -torch.sum(error.clip(min=0.0, max=1.0), dim=1) + return -torch.mean(error.clip(min=0.0, max=1.0), dim=1) def _reward_tracking_lin_vel(self): """Tracking of linear velocity commands (xy axes)""" error = torch.square(self.commands[:, :2] - self.base_lin_vel[:, :2]) error = torch.exp(-error / self.cfg.reward_settings.tracking_sigma) - return torch.sum(error, dim=1) + return torch.mean(error, dim=1) def _reward_tracking_ang_vel(self): """Tracking of angular velocity commands (yaw)""" @@ -1092,7 +1096,7 @@ def _reward_tracking_ang_vel(self): def _reward_feet_contact_forces(self): """penalize high contact forces""" - return -torch.sum( + return -torch.mean( ( torch.norm(self.contact_forces[:, self.feet_indices, :], dim=-1) - self.cfg.reward_settings.max_contact_force diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 0de83e0e..c38f3e33 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -163,7 +163,7 @@ class asset: file = "" # * name of the feet bodies, # * used to index body state and contact force tensors - foot_name = "None" + foot_name = "foot" penalize_contacts_on = [] terminate_after_contacts_on = [] end_effector_names = [] @@ -233,15 +233,15 @@ class LeggedRobotRunnerCfg(BaseConfig): class logging: enable_local_saving = True - class policy: + class actor: init_noise_std = 1.0 - actor_hidden_dims = [512, 256, 128] - critic_hidden_dims = [512, 256, 128] + hidden_dims = [512, 256, 128] # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" normalize_obs = True + smooth_exploration = False - actor_obs = [ + obs = [ "observation_a", "observation_b", "these_need_to_be_atributes_(states)_of_the_robot_env", @@ -263,6 +263,17 @@ class noise: projected_gravity = 0.05 height_measurements = 0.1 + class critic: + hidden_dims = [512, 256, 128] + # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + normalize_obs = True + obs = [ + "observation_x", + "observation_y", + "critic_obs_can_be_the_same_or_different_than_actor_obs", + ] + class reward: class weights: tracking_lin_vel = 0.0 @@ -283,25 +294,29 @@ class termination_weight: termination = 0.01 class algorithm: - # * training params - value_loss_coef = 1.0 - use_clipped_value_loss = True + # both + gamma = 0.99 + lam = 0.95 + # shared + batch_size = 2**15 + max_gradient_steps = 24 + # new clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 5 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor + entropy_coef = 0.01 schedule = "adaptive" # could be adaptive, fixed - gamma = 0.99 - lam = 0.95 desired_kl = 0.01 - max_grad_norm = 1.0 + lr_range = [2e-5, 1e-2] + lr_ratio = 1.5 class runner: policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" - num_steps_per_env = 24 + algorithm_class_name = "PPO2" + num_steps_per_env = 24 # deprecate max_iterations = 1500 save_interval = 50 run_name = "" diff --git 
a/gym/envs/base/task_skeleton.py b/gym/envs/base/task_skeleton.py index ffbac6ce..4dfcb280 100644 --- a/gym/envs/base/task_skeleton.py +++ b/gym/envs/base/task_skeleton.py @@ -5,10 +5,15 @@ class TaskSkeleton: - def __init__(self, num_envs=1, max_episode_length=1.0, device="cpu"): + def __init__(self, num_envs=1, device="cpu"): self.num_envs = num_envs - self.max_episode_length = max_episode_length self.device = device + + self.to_be_reset = torch.ones(num_envs, device=device, dtype=torch.bool) + self.terminated = torch.ones(num_envs, device=device, dtype=torch.bool) + self.episode_length_buf = torch.zeros(num_envs, device=device, dtype=torch.long) + self.timed_out = torch.zeros(num_envs, device=device, dtype=torch.bool) + return None def get_states(self, obs_list): @@ -45,31 +50,19 @@ def reset(self): """Reset all robots""" self._reset_idx(torch.arange(self.num_envs, device=self.device)) self.step() + self.episode_length_buf[:] = 0 def _reset_buffers(self): self.to_be_reset[:] = False self.terminated[:] = False self.timed_out[:] = False - def compute_reward(self, reward_weights): - """Compute and return a torch tensor of rewards - reward_weights: dict with keys matching reward names, and values - matching weights - """ - reward = torch.zeros(self.num_envs, device=self.device, dtype=torch.float) - for name, weight in reward_weights.items(): - reward += weight * self._eval_reward(name) - return reward - - def _eval_reward(self, name): - return eval("self._reward_" + name + "()") - def _check_terminations_and_timeouts(self): """Check if environments need to be reset""" contact_forces = self.contact_forces[:, self.termination_contact_indices, :] - self.terminated = torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1) + self.terminated |= torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1) self.timed_out = self.episode_length_buf >= self.max_episode_length - self.to_be_reset = self.timed_out | self.terminated + # self.to_be_reset = self.timed_out | self.terminated def step(self, actions): raise NotImplementedError diff --git a/gym/envs/cartpole/cartpole_config.py b/gym/envs/cartpole/cartpole_config.py index d8a19292..c3489021 100644 --- a/gym/envs/cartpole/cartpole_config.py +++ b/gym/envs/cartpole/cartpole_config.py @@ -7,7 +7,7 @@ class CartpoleCfg(FixedRobotCfg): class env(FixedRobotCfg.env): num_envs = 4024 num_actuators = 1 # 1 for the cart force - episode_length_s = 20 + episode_length_s = 5.0 class terrain(FixedRobotCfg.terrain): pass @@ -35,7 +35,8 @@ class control(FixedRobotCfg.control): 1, # slider_to_cart 0, ] # cart_to_pole - + stiffness = {"slider_to_cart": 0.0} + damping = {"slider_to_cart": 0.0} ctrl_frequency = 250 desired_sim_frequency = 500 @@ -67,15 +68,14 @@ class CartpoleRunnerCfg(FixedRobotCfgPPO): seed = -1 runner_class_name = "OnPolicyRunner" - class policy(FixedRobotCfgPPO.policy): + class actor(FixedRobotCfgPPO.actor): init_noise_std = 1.0 num_layers = 2 num_units = 32 - actor_hidden_dims = [num_units] * num_layers - critic_hidden_dims = [num_units] * num_layers + hidden_dims = [num_units] * num_layers activation = "elu" - actor_obs = [ + obs = [ "cart_obs", "pole_trig_obs", "dof_vel", @@ -83,8 +83,6 @@ class policy(FixedRobotCfgPPO.policy): "pole_vel_square", ] - critic_obs = actor_obs - actions = ["tau_ff"] class noise: @@ -94,6 +92,20 @@ class noise: pole_vel = 0.010 actuation = 0.00 + class critic: + num_layers = 2 + num_units = 32 + hidden_dims = [num_units] * num_layers + activation = "elu" + + obs = [ + "cart_obs", + "pole_trig_obs", + "dof_vel", + 
"cart_vel_square", + "pole_vel_square", + ] + class reward: class weights: pole_pos = 5 @@ -108,25 +120,12 @@ class termination_weight: termination = 0.0 class algorithm(FixedRobotCfgPPO.algorithm): - # training params - value_loss_coef = 1.0 - use_clipped_value_loss = True - clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 5 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 - learning_rate = 1.0e-3 - schedule = "adaptive" # could be adaptive, fixed - discount_horizon = 1.0 # [s] - GAE_bootstrap_horizon = 0.2 # [s] - desired_kl = 0.01 - max_grad_norm = 1.0 + pass class runner(FixedRobotCfgPPO.runner): policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" - num_steps_per_env = 96 # per iteration + algorithm_class_name = "PPO2" + num_steps_per_env = 32 # per iteration max_iterations = 500 # number of policy updates # * logging diff --git a/gym/envs/mini_cheetah/mini_cheetah_SAC_config.py b/gym/envs/mini_cheetah/mini_cheetah_SAC_config.py new file mode 100644 index 00000000..13348b69 --- /dev/null +++ b/gym/envs/mini_cheetah/mini_cheetah_SAC_config.py @@ -0,0 +1,183 @@ +from gym.envs.base.legged_robot_config import LeggedRobotRunnerCfg +from gym.envs.mini_cheetah.mini_cheetah_ref_config import MiniCheetahRefCfg + +BASE_HEIGHT_REF = 0.3 + + +class MiniCheetahSACCfg(MiniCheetahRefCfg): + class env(MiniCheetahRefCfg.env): + num_envs = 1 + episode_length_s = 4 # TODO + + class terrain(MiniCheetahRefCfg.terrain): + pass + + class init_state(MiniCheetahRefCfg.init_state): + pass + + class control(MiniCheetahRefCfg.control): + # * PD Drive parameters: + stiffness = {"haa": 20.0, "hfe": 20.0, "kfe": 20.0} + damping = {"haa": 0.5, "hfe": 0.5, "kfe": 0.5} + gait_freq = 3.0 + ctrl_frequency = 20 # TODO + desired_sim_frequency = 100 + + class commands(MiniCheetahRefCfg.commands): + pass + + class push_robots(MiniCheetahRefCfg.push_robots): + pass + + class domain_rand(MiniCheetahRefCfg.domain_rand): + pass + + class asset(MiniCheetahRefCfg.asset): + file = ( + "{LEGGED_GYM_ROOT_DIR}/resources/robots/" + + "mini_cheetah/urdf/mini_cheetah_simple.urdf" + ) + foot_name = "foot" + penalize_contacts_on = ["shank"] + terminate_after_contacts_on = ["base"] + end_effector_names = ["foot"] + collapse_fixed_joints = False + self_collisions = 1 + flip_visual_attachments = False + disable_gravity = False + disable_motors = False + joint_damping = 0.1 + rotor_inertia = [0.002268, 0.002268, 0.005484] * 4 + + class reward_settings(MiniCheetahRefCfg.reward_settings): + soft_dof_pos_limit = 0.9 + soft_dof_vel_limit = 0.9 + soft_torque_limit = 0.9 + max_contact_force = 600.0 + base_height_target = BASE_HEIGHT_REF + tracking_sigma = 0.25 + + class scaling(MiniCheetahRefCfg.scaling): + base_ang_vel = 0.3 + base_lin_vel = BASE_HEIGHT_REF + dof_vel = 4 * [2.0, 2.0, 4.0] + base_height = 0.3 + dof_pos = 4 * [0.2, 0.3, 0.3] + dof_pos_obs = dof_pos + dof_pos_target = 4 * [0.2, 0.3, 0.3] + tau_ff = 4 * [18, 18, 28] + commands = [3, 1, 3] + + +class MiniCheetahSACRunnerCfg(LeggedRobotRunnerCfg): + seed = -1 + runner_class_name = "OffPolicyRunner" + + class actor(LeggedRobotRunnerCfg.actor): + hidden_dims = { + "latent": [128, 128], + "mean": [64], + "std": [64], + } + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = { + "latent": "elu", + "mean": "elu", + "std": "elu", + } + + # TODO[lm]: Handle normalization + normalize_obs = False + obs = [ + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "phase_obs", + ] + + actions = 
["dof_pos_target"] + add_noise = False # TODO + disable_actions = False + + class noise: + scale = 1.0 + dof_pos_obs = 0.01 + base_ang_vel = 0.01 + dof_pos = 0.005 + dof_vel = 0.005 + lin_vel = 0.05 + ang_vel = [0.3, 0.15, 0.4] + gravity_vec = 0.1 + + class critic(LeggedRobotRunnerCfg.critic): + hidden_dims = [128, 128, 64] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + + # TODO[lm]: Handle normalization + normalize_obs = False + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "phase_obs", + "dof_pos_target", + ] + + class reward: + class weights: + tracking_lin_vel = 4.0 + tracking_ang_vel = 2.0 + lin_vel_z = 0.0 + ang_vel_xy = 0.01 + orientation = 1.0 + torques = 5.0e-7 + dof_vel = 0.0 + min_base_height = 1.5 + collision = 0.0 + action_rate = 0.01 + action_rate2 = 0.001 + stand_still = 0.0 + dof_pos_limits = 0.0 + feet_contact_forces = 0.0 + dof_near_home = 0.0 + reference_traj = 1.5 + swing_grf = 1.5 + stance_grf = 1.5 + + class termination_weight: + termination = 0.15 + + class algorithm(LeggedRobotRunnerCfg.algorithm): + # Taken from SAC pendulum + initial_fill = 500 + storage_size = 10**6 + batch_size = 256 + max_gradient_steps = 1 # 10 + action_max = 1.0 # TODO + action_min = -1.0 # TODO + actor_noise_std = 0.5 # TODO + log_std_max = 4.0 + log_std_min = -20.0 + alpha = 0.2 + target_entropy = -12.0 # -action_dim + max_grad_norm = 1.0 + polyak = 0.995 # flipped compared to SB3 (polyak == 1-tau) + gamma = 0.99 + alpha_lr = 1e-4 + actor_lr = 1e-4 + critic_lr = 1e-4 + + class runner(LeggedRobotRunnerCfg.runner): + run_name = "" + experiment_name = "sac_mini_cheetah" + max_iterations = 50_000 + algorithm_class_name = "SAC" + save_interval = 10_000 + num_steps_per_env = 1 diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index de015b21..ba2655f7 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -124,25 +124,13 @@ class scaling(LeggedRobotCfg.scaling): class MiniCheetahRunnerCfg(LeggedRobotRunnerCfg): seed = -1 + runner_class_name = "OnPolicyRunner" - class policy(LeggedRobotRunnerCfg.policy): - actor_hidden_dims = [256, 256, 128] - critic_hidden_dims = [128, 64] + class actor(LeggedRobotRunnerCfg.actor): + hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - - actor_obs = [ - "base_lin_vel", - "base_ang_vel", - "projected_gravity", - "commands", - "dof_pos_obs", - "dof_vel", - "dof_pos_target", - ] - critic_obs = [ - "base_height", - "base_lin_vel", + obs = [ "base_ang_vel", "projected_gravity", "commands", @@ -150,9 +138,10 @@ class policy(LeggedRobotRunnerCfg.policy): "dof_vel", "dof_pos_target", ] - + normalize_obs = False actions = ["dof_pos_target"] add_noise = True + disable_actions = False class noise: scale = 1.0 @@ -164,8 +153,23 @@ class noise: ang_vel = [0.3, 0.15, 0.4] gravity_vec = 0.1 - class reward(LeggedRobotRunnerCfg.policy.reward): - class weights(LeggedRobotRunnerCfg.policy.reward.weights): + class critic(LeggedRobotRunnerCfg.critic): + hidden_dims = [128, 64] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "dof_pos_target", + ] + + class reward: + class weights: tracking_lin_vel = 4.0 tracking_ang_vel = 2.0 lin_vel_z = 0.0 @@ 
-185,24 +189,11 @@ class termination_weight: termination = 0.01 class algorithm(LeggedRobotRunnerCfg.algorithm): - # * training params - value_loss_coef = 1.0 - use_clipped_value_loss = True - clip_param = 0.2 - entropy_coef = 0.02 - num_learning_epochs = 4 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 8 - learning_rate = 1.0e-5 - schedule = "adaptive" # can be adaptive or fixed - discount_horizon = 1.0 # [s] - GAE_bootstrap_horizon = 2.0 # [s] - desired_kl = 0.01 - max_grad_norm = 1.0 + pass class runner(LeggedRobotRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah" max_iterations = 500 - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc.py b/gym/envs/mini_cheetah/mini_cheetah_osc.py index acec0b8a..6d44e35d 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_osc.py +++ b/gym/envs/mini_cheetah/mini_cheetah_osc.py @@ -16,6 +16,9 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): def _init_buffers(self): super()._init_buffers() + + self._switch = torch.zeros(self.num_envs, 1, device=self.device) + self.oscillators = torch.zeros(self.num_envs, 4, device=self.device) self.oscillator_obs = torch.zeros(self.num_envs, 8, device=self.device) @@ -158,6 +161,7 @@ def _post_decimation_step(self): """Update all states that are not handled in PhysX""" super()._post_decimation_step() self.grf = self._compute_grf() + self._update_cmd_switch() # self._step_oscillators() def _post_physx_step(self): @@ -266,12 +270,16 @@ def _compute_grf(self, grf_norm=True): else: return grf - def _switch(self): + def _update_cmd_switch(self): c_vel = torch.linalg.norm(self.commands, dim=1) - return torch.exp( + self._switch = torch.exp( -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) / self.cfg.reward_settings.switch_scale ) + # return torch.exp( + # -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) + # / self.cfg.reward_settings.switch_scale + # ) def _reward_cursorial(self): # penalize the abad joints being away from 0 @@ -357,10 +365,10 @@ def _reward_stance_velocity(self): return -torch.mean(rew, dim=1) def _reward_dof_vel(self): - return super()._reward_dof_vel() * self._switch() + return super()._reward_dof_vel() * self._switch def _reward_dof_near_home(self): - return super()._reward_dof_near_home() * self._switch() + return super()._reward_dof_near_home() * self._switch def _reward_stand_still(self): """Penalize motion at zero commands""" @@ -371,11 +379,11 @@ def _reward_stand_still(self): rew_vel = torch.mean(self._sqrdexp(self.dof_vel), dim=1) rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1) rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1) - return (rew_vel + rew_pos - rew_base_vel) * self._switch() + return (rew_vel + rew_pos - rew_base_vel) * self._switch def _reward_standing_torques(self): """Penalize torques at zero commands""" - return super()._reward_torques() * self._switch() + return super()._reward_torques() * self._switch # * gait similarity scores def angle_difference(self, theta1, theta2): diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py index a08d5179..34996404 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py @@ -64,7 +64,7 @@ class control(MiniCheetahCfg.control): # * PD Drive parameters: stiffness = {"haa": 20.0, "hfe": 20.0, "kfe": 20.0} damping = {"haa": 
0.5, "hfe": 0.5, "kfe": 0.5} - ctrl_frequency = 100 + ctrl_frequency = 500 desired_sim_frequency = 500 class osc: @@ -150,7 +150,7 @@ class reward_settings(MiniCheetahCfg.reward_settings): soft_dof_vel_limit = 0.9 soft_torque_limit = 0.9 max_contact_force = 600.0 - base_height_target = BASE_HEIGHT_REF + 0.03 + base_height_target = 0.3 tracking_sigma = 0.25 switch_scale = 0.5 @@ -160,14 +160,16 @@ class scaling(MiniCheetahCfg.scaling): class MiniCheetahOscRunnerCfg(MiniCheetahRunnerCfg): seed = -1 + runner_class_name = "OnPolicyRunner" - class policy(MiniCheetahRunnerCfg.policy): - actor_hidden_dims = [256, 256, 128] - critic_hidden_dims = [256, 256, 128] + class actor(MiniCheetahRunnerCfg.actor): + frequency = 100 + hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - - actor_obs = [ + smooth_exploration = False + exploration_sample_freq = 16 + obs = [ "base_ang_vel", "projected_gravity", "commands", @@ -176,8 +178,26 @@ class policy(MiniCheetahRunnerCfg.policy): "oscillator_obs", "dof_pos_target", ] + normalize_obs = False + + actions = ["dof_pos_target"] + disable_actions = False + + class noise: + scale = 1.0 + dof_pos_obs = 0.01 + base_ang_vel = 0.01 + dof_pos = 0.005 + dof_vel = 0.005 + lin_vel = 0.05 + ang_vel = [0.3, 0.15, 0.4] + gravity_vec = 0.1 - critic_obs = [ + class critic(MiniCheetahRunnerCfg.critic): + hidden_dims = [256, 256, 128] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -189,11 +209,7 @@ class policy(MiniCheetahRunnerCfg.policy): "oscillators_vel", "dof_pos_target", ] - - actions = ["dof_pos_target"] - - class noise: - pass + normalize_obs = True class reward: class weights: @@ -202,12 +218,12 @@ class weights: lin_vel_z = 0.0 ang_vel_xy = 0.01 orientation = 1.0 - torques = 5.0e-7 + torques = 5.0e-6 dof_vel = 0.0 min_base_height = 1.5 collision = 0 - action_rate = 0.01 # -0.01 - action_rate2 = 0.001 # -0.001 + action_rate = 0.001 # -0.01 + action_rate2 = 0.0001 # -0.001 stand_still = 0.0 dof_pos_limits = 0.0 feet_contact_forces = 0.0 @@ -222,27 +238,33 @@ class weights: standing_torques = 0.0 # 1.e-5 class termination_weight: - termination = 15.0 / 100.0 + termination = 0.15 class algorithm(MiniCheetahRunnerCfg.algorithm): - # training params - value_loss_coef = 1.0 - use_clipped_value_loss = True + # both + gamma = 0.99 + lam = 0.98 + batch_size = 2**15 + max_gradient_steps = 24 + # new + storage_size = 2**17 # new + batch_size = 2**15 # new + clip_param = 0.2 - entropy_coef = 0.02 - num_learning_epochs = 4 - # mini batch size = num_envs*nsteps/nminibatches - num_mini_batches = 8 - learning_rate = 1.0e-5 - schedule = "adaptive" # can be adaptive, fixed - discount_horizon = 1.0 - GAE_bootstrap_horizon = 2.0 - desired_kl = 0.01 + learning_rate = 1.0e-3 max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor + entropy_coef = 0.01 + schedule = "adaptive" # could be adaptive, fixed + desired_kl = 0.01 + lr_range = [2e-5, 1e-2] + lr_ratio = 1.5 class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "FullSend" max_iterations = 500 # number of policy updates - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref.py b/gym/envs/mini_cheetah/mini_cheetah_ref.py index 57e1f0e7..0084a80c 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref.py @@ -17,6 +17,7 @@ def 
__init__(self, gym, sim, cfg, sim_params, sim_device, headless): def _init_buffers(self): super()._init_buffers() + self._switch = torch.zeros(self.num_envs, 1, device=self.device) self.phase = torch.zeros( self.num_envs, 1, dtype=torch.float, device=self.device ) @@ -41,6 +42,7 @@ def _post_decimation_step(self): self.phase_obs = torch.cat( (torch.sin(self.phase), torch.cos(self.phase)), dim=1 ) + self._update_cmd_switch() def _resample_commands(self, env_ids): super()._resample_commands(env_ids) @@ -50,10 +52,20 @@ def _resample_commands(self, env_ids): ).squeeze(1) self.commands[env_ids, :3] *= (rand_ids < 0.9).unsqueeze(1) - def _switch(self): + def _check_terminations_and_timeouts(self): + """Check if environments need to be reset""" + contact_forces = self.contact_forces[:, self.termination_contact_indices, :] + self.terminated |= torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1) + self.timed_out = self.episode_length_buf >= self.max_episode_length + # self.to_be_reset = self.timed_out | self.terminated + + # --- + + def _update_cmd_switch(self): c_vel = torch.linalg.norm(self.commands, dim=1) - return torch.exp( - -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) / 0.1 + self._switch = torch.exp( + -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) + / self.cfg.reward_settings.switch_scale ) def _reward_swing_grf(self): @@ -64,7 +76,7 @@ def _reward_swing_grf(self): ) ph_off = torch.lt(self.phase, torch.pi) rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1) - return -torch.sum(rew.float(), dim=1) * (1 - self._switch()) + return -torch.sum(rew.float(), dim=1) * (1 - self._switch) def _reward_stance_grf(self): """Reward non-zero grf during stance (pi to 2pi)""" @@ -75,7 +87,7 @@ def _reward_stance_grf(self): ph_off = torch.gt(self.phase, torch.pi) # should this be in swing? rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1) - return torch.sum(rew.float(), dim=1) * (1 - self._switch()) + return torch.sum(rew.float(), dim=1) * (1 - self._switch) def _reward_reference_traj(self): """REWARDS EACH LEG INDIVIDUALLY BASED ON ITS POSITION IN THE CYCLE""" @@ -84,7 +96,7 @@ def _reward_reference_traj(self): error /= self.scales["dof_pos"] reward = (self._sqrdexp(error) - torch.abs(error) * 0.2).mean(dim=1) # * only when commanded velocity is higher - return reward * (1 - self._switch()) + return reward * (1 - self._switch) def _get_ref(self): leg_frame = torch.zeros_like(self.torques) @@ -112,10 +124,10 @@ def _reward_stand_still(self): rew_vel = torch.mean(self._sqrdexp(self.dof_vel), dim=1) rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1) rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1) - return (rew_vel + rew_pos - rew_base_vel) * self._switch() + return (rew_vel + rew_pos - rew_base_vel) * self._switch def _reward_tracking_lin_vel(self): """Tracking linear velocity commands (xy axes)""" # just use lin_vel? 
reward = super()._reward_tracking_lin_vel() - return reward * (1 - self._switch()) + return reward * (1 - self._switch) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 41bb634f..37c52919 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -26,7 +26,7 @@ class control(MiniCheetahCfg.control): stiffness = {"haa": 20.0, "hfe": 20.0, "kfe": 20.0} damping = {"haa": 0.5, "hfe": 0.5, "kfe": 0.5} gait_freq = 3.0 - ctrl_frequency = 100 + ctrl_frequency = 500 desired_sim_frequency = 500 class commands(MiniCheetahCfg.commands): @@ -60,6 +60,7 @@ class reward_settings(MiniCheetahCfg.reward_settings): max_contact_force = 600.0 base_height_target = 0.3 tracking_sigma = 0.25 + switch_scale = 0.1 class scaling(MiniCheetahCfg.scaling): pass @@ -67,15 +68,17 @@ class scaling(MiniCheetahCfg.scaling): class MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg): seed = -1 - runner_class_name = "MyRunner" + runner_class_name = "OnPolicyRunner" - class policy(MiniCheetahRunnerCfg.policy): - actor_hidden_dims = [256, 256, 128] - critic_hidden_dims = [256, 256, 128] + class actor(MiniCheetahRunnerCfg.actor): + frequency = 100 + hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + layer_norm = [True, True, False] activation = "elu" - - actor_obs = [ + smooth_exploration = False + exploration_sample_freq = 16 + obs = [ "base_ang_vel", "projected_gravity", "commands", @@ -83,8 +86,29 @@ class policy(MiniCheetahRunnerCfg.policy): "dof_vel", "phase_obs", ] + normalize_obs = False + + actions = ["dof_pos_target"] + disable_actions = False + + class noise: + scale = 1.0 + dof_pos_obs = 0.01 + base_ang_vel = 0.01 + dof_pos = 0.005 + dof_vel = 0.005 + lin_vel = 0.05 + ang_vel = [0.3, 0.15, 0.4] + gravity_vec = 0.1 + + class critic(MiniCheetahRunnerCfg.critic): + hidden_dims = [256, 256, 128] + layer_norm = [True, True, False] + dropouts = [0.1, 0.0, 0.0] - critic_obs = [ + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -95,12 +119,7 @@ class policy(MiniCheetahRunnerCfg.policy): "phase_obs", "dof_pos_target", ] - - actions = ["dof_pos_target"] - disable_actions = False - - class noise: - pass + normalize_obs = False class reward: class weights: @@ -109,49 +128,46 @@ class weights: lin_vel_z = 0.0 ang_vel_xy = 0.01 orientation = 1.0 - torques = 5.0e-7 + torques = 5.0e-8 dof_vel = 0.0 min_base_height = 1.5 collision = 0.0 - action_rate = 0.01 - action_rate2 = 0.001 + action_rate = 1e-5 + action_rate2 = 1e-6 stand_still = 0.0 dof_pos_limits = 0.0 feet_contact_forces = 0.0 dof_near_home = 0.0 - reference_traj = 1.5 + reference_traj = 0.0 swing_grf = 1.5 stance_grf = 1.5 - class pbrs_weights: - reference_traj = 0.0 - swing_grf = 0.0 - stance_grf = 0.0 - min_base_height = 0.0 - orientation = 0.0 - class termination_weight: termination = 0.15 class algorithm(MiniCheetahRunnerCfg.algorithm): - # training params - value_loss_coef = 1.0 - use_clipped_value_loss = True + # both + gamma = 0.99 + lam = 0.95 + # shared + batch_size = 2 * 4096 # use all the data + max_gradient_steps = 50 + clip_param = 0.2 + learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor entropy_coef = 0.01 - num_learning_epochs = 6 - # mini batch size = num_envs*nsteps/nminibatches - num_mini_batches = 4 - learning_rate = 5.0e-5 - schedule = "adaptive" # can be 
adaptive, fixed - discount_horizon = 1.0 # [s] - GAE_bootstrap_horizon = 1.0 # [s] + schedule = "adaptive" # could be adaptive, fixed desired_kl = 0.01 - max_grad_norm = 1.0 + lr_range = [2e-5, 1e-2] + lr_ratio = 1.5 class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" max_iterations = 500 # number of policy updates - algorithm_class_name = "PPO" - num_steps_per_env = 32 + algorithm_class_name = "PPO2" + num_steps_per_env = 20 # deprecate diff --git a/gym/envs/mit_humanoid/humanoid_running_config.py b/gym/envs/mit_humanoid/humanoid_running_config.py index 86a113f4..5ad7f3b5 100644 --- a/gym/envs/mit_humanoid/humanoid_running_config.py +++ b/gym/envs/mit_humanoid/humanoid_running_config.py @@ -142,7 +142,7 @@ class asset(LeggedRobotCfg.asset): # +'humanoid_fixed_arms_full.urdf') file = ( "{LEGGED_GYM_ROOT_DIR}/resources/robots/" - + "mit_humanoid/urdf/humanoid_F_sf.urdf" + + "mit_humanoid/urdf/humanoid_F_sf_learnt.urdf" ) keypoints = ["base"] end_effectors = ["left_foot", "right_foot"] @@ -165,7 +165,6 @@ class asset(LeggedRobotCfg.asset): fix_base_link = False disable_gravity = False - disable_actions = False disable_motors = False # (1: disable, 0: enable...bitwise filter) @@ -201,15 +200,14 @@ class HumanoidRunningRunnerCfg(LeggedRobotRunnerCfg): do_wandb = True seed = -1 - class policy(LeggedRobotRunnerCfg.policy): + class actor(LeggedRobotRunnerCfg.actor): init_noise_std = 1.0 - actor_hidden_dims = [256, 256, 256] - critic_hidden_dims = [256, 256, 256] + hidden_dims = [256, 256, 256] # (elu, relu, selu, crelu, lrelu, tanh, sigmoid) activation = "elu" normalize_obs = True # True, False - actor_obs = [ + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -221,10 +219,8 @@ class policy(LeggedRobotRunnerCfg.policy): "in_contact", ] - critic_obs = actor_obs - actions = ["dof_pos_target_legs"] - + disable_actions = False add_noise = True noise_level = 1.0 # scales other values @@ -237,6 +233,24 @@ class noise: dof_vel = 0.01 in_contact = 0.1 + class critic: + hidden_dims = [256, 256, 256] + # (elu, relu, selu, crelu, lrelu, tanh, sigmoid) + activation = "elu" + normalize_obs = True # True, False + + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "phase_obs", + "dof_pos_legs", + "dof_vel_legs", + "in_contact", + ] + class reward: class weights: # * Behavioral rewards * # @@ -259,26 +273,14 @@ class termination_weight: termination = 1 class algorithm(LeggedRobotRunnerCfg.algorithm): - # algorithm training hyperparameters - value_loss_coef = 1.0 - use_clipped_value_loss = True - clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 5 - num_mini_batches = 4 # minibatch size = num_envs*nsteps/nminibatches - learning_rate = 1.0e-5 - schedule = "adaptive" # could be adaptive, fixed - gamma = 0.999 - lam = 0.99 - desired_kl = 0.01 - max_grad_norm = 1.0 + pass class runner(LeggedRobotRunnerCfg.runner): policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" - num_steps_per_env = 24 + algorithm_class_name = "PPO2" + num_steps_per_env = 32 max_iterations = 1000 - run_name = "ICRA2023" + run_name = "HumanoidRunning" experiment_name = "HumanoidLocomotion" save_interval = 50 plot_input_gradients = False diff --git a/gym/envs/mit_humanoid/lander.py b/gym/envs/mit_humanoid/lander.py new file mode 100644 index 00000000..86d37321 --- /dev/null +++ b/gym/envs/mit_humanoid/lander.py @@ -0,0 +1,127 @@ +import torch + +# from gym.envs.base.legged_robot import LeggedRobot +from 
gym.envs.mit_humanoid.mit_humanoid import MIT_Humanoid +from isaacgym.torch_utils import torch_rand_float + + +class Lander(MIT_Humanoid): + def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): + super().__init__(gym, sim, cfg, sim_params, sim_device, headless) + + def _resample_commands(self, env_ids): + """Randomly select commands of some environments + + Args: + env_ids (List[int]): Environment ids for which new commands are needed + """ + if len(env_ids) == 0: + return + super()._resample_commands(env_ids) + # * with 75% chance, reset to 0 + self.commands[env_ids, :] *= ( + torch_rand_float(0, 1, (len(env_ids), 1), device=self.device).squeeze(1) + < 0.25 + ).unsqueeze(1) + + def _check_terminations_and_timeouts(self): + """Check if environments need to be reset""" + contact_forces = self.contact_forces[:, self.termination_contact_indices, :] + self.terminated |= torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1) + self.timed_out = self.episode_length_buf >= self.max_episode_length + # self.to_be_reset = self.timed_out | self.terminated + + # --- rewards --- + + def _switch(self, mode=None): + c_vel = torch.linalg.norm(self.commands, dim=1) + switch = torch.exp( + -torch.square( + torch.max( + torch.zeros_like(c_vel), + c_vel - self.cfg.reward_settings.switch_scale, + ) + ) + / self.cfg.reward_settings.switch_scale + ) + if mode is None or mode == "stand": + return switch + elif mode == "move": + return 1 - switch + + def _reward_lin_vel_xy(self): + return torch.exp( + -torch.linalg.norm(self.commands[:, :2] - self.base_lin_vel[:, :2], dim=1) + ) + + def _reward_lin_vel_z(self): + # Penalize z axis base linear velocity w. squared exp + return self._sqrdexp(self.base_lin_vel[:, 2] / self.scales["base_lin_vel"]) + + def _reward_orientation(self): + # Penalize non flat base orientation + return torch.sum( + self._sqrdexp(torch.square(self.projected_gravity[:, :2])), dim=1 + ) + + def _reward_min_base_height(self): + """Squared exponential saturating at base_height target""" + error = self.base_height - self.cfg.reward_settings.base_height_target + error = torch.clamp(error, max=0, min=None).flatten() + return self._sqrdexp(error) + + def _reward_tracking_lin_vel(self): + """Tracking of linear velocity commands (xy axes)""" + # just use lin_vel? + error = self.commands[:, :2] - self.base_lin_vel[:, :2] + # * scale by (1+|cmd|): if cmd=0, no scaling.
+ error *= 1.0 / (1.0 + torch.abs(self.commands[:, :2])) + return torch.mean(self._sqrdexp(error), dim=1) + + def _reward_dof_vel(self): + # Penalize dof velocities + return torch.mean( + self._sqrdexp(self.dof_vel / self.scales["dof_vel"]), dim=1 + ) * self._switch("stand") + + def _reward_dof_near_home(self): + return self._sqrdexp( + (self.dof_pos - self.default_dof_pos) / self.scales["dof_pos"] + ).mean(dim=1) + + def _reward_stand_still(self): + """Penalize motion at zero commands""" + # * normalize angles so we care about being within 5 deg + rew_pos = torch.mean( + self._sqrdexp((self.dof_pos - self.default_dof_pos) / torch.pi * 36), dim=1 + ) + rew_vel = torch.mean(self._sqrdexp(self.dof_vel), dim=1) + rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1) + rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1) + return rew_vel + rew_pos - rew_base_vel * self._switch("stand") + + def _compute_grf(self, grf_norm=True): + grf = torch.norm(self.contact_forces[:, self.feet_indices, :], dim=-1) + if grf_norm: + return torch.clamp_max(grf / self.cfg.asset.total_mass, 1.0) + else: + return grf + + def smooth_sqr_wave(self, phase, sigma=0.2): # sigma=0 is step function + return phase.sin() / (2 * torch.sqrt(phase.sin() ** 2.0 + sigma**2.0)) + 0.5 + + def _reward_hips_forward(self): + # reward hip motors for pointing forward + hip_yaw_abad = torch.cat((self.dof_pos[:, 0:2], self.dof_pos[:, 5:7]), dim=1) + hip_yaw_abad -= torch.cat( + (self.default_dof_pos[:, 0:2], self.default_dof_pos[:, 5:7]), dim=1 + ) + hip_yaw_abad /= torch.cat( + (self.scales["dof_pos"][0:2], self.scales["dof_pos"][5:7]) + ) + return (hip_yaw_abad).pow(2).mean(dim=1) + # return self._sqrdexp(hip_yaw_abad).sum(dim=1).mean(dim=1) + + def _reward_power(self): + power = self.torques * self.dof_vel + return power.pow(2).mean(dim=1) diff --git a/gym/envs/mit_humanoid/lander_config.py b/gym/envs/mit_humanoid/lander_config.py new file mode 100644 index 00000000..86f3329f --- /dev/null +++ b/gym/envs/mit_humanoid/lander_config.py @@ -0,0 +1,328 @@ +from gym.envs.base.legged_robot_config import ( + LeggedRobotCfg, + LeggedRobotRunnerCfg, +) + +BASE_HEIGHT_REF = 0.80 + + +class LanderCfg(LeggedRobotCfg): + class env(LeggedRobotCfg.env): + num_envs = 4096 + num_actuators = 18 + episode_length_s = 10 # episode length in seconds + + sampled_history_length = 3 # n samples + sampled_history_frequency = 10 # [Hz] + + class terrain(LeggedRobotCfg.terrain): + pass + + class init_state(LeggedRobotCfg.init_state): + # * default setup chooses how the initial conditions are chosen. + # * "reset_to_basic" = a single position with added randomized noise. + # * "reset_to_range" = a range of joint positions and velocities. + # * "reset_to_traj" = feed in a trajectory to sample from. 
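+ # * "reset_to_range" (used here) draws the initial joint and base states from the
+ # * dof_pos_range, dof_vel_range, root_pos_range and root_vel_range entries below.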
+ reset_mode = "reset_to_range" + + default_joint_angles = { + "hip_yaw": 0.0, + "hip_abad": 0.0, + "hip_pitch": -0.667751, + "knee": 1.4087, + "ankle": -0.708876, + "shoulder_pitch": 0.0, + "shoulder_abad": 0.0, + "shoulder_yaw": 0.0, + "elbow": -1.25, + } + + # * default COM for basic initialization + pos = [0.0, 0.0, 0.6] # x,y,z [m] + rot = [0.0, 0.0, 0.0, 1.0] # x,y,z,w [quat] + lin_vel = [0.0, 0.0, 0.0] # x,y,z [m/s] + ang_vel = [0.0, 0.0, 0.0] # x,y,z [rad/s] + + # * initialization for random range setup + + dof_pos_range = { + "hip_yaw": [-0.0, 0.0], + "hip_abad": [-0.0, 0.0], + "hip_pitch": [-0.667751, -0.667751], + "knee": [1.4087, 1.4087], + "ankle": [-0.708876, -0.708876], + "shoulder_pitch": [0.0, 0.0], + "shoulder_abad": [0.0, 0.0], + "shoulder_yaw": [0.0, 0.0], + "elbow": [0, 0], + } + dof_vel_range = { + "hip_yaw": [-0.1, 0.1], + "hip_abad": [-0.1, 0.1], + "hip_pitch": [-0.1, 0.1], + "knee": [-0.1, 0.1], + "ankle": [-0.1, 0.1], + "shoulder_pitch": [0.0, 0.0], + "shoulder_abad": [0.0, 0.0], + "shoulder_yaw": [0.0, 0.0], + "elbow": [0.0, 0.0], + } + + root_pos_range = [ + [0.0, 0.0], # x + [0.0, 0.0], # y + [0.64, 1.5], # z + [-0.1, 0.1], # roll + [-0.1, 0.1], # pitch + [-0.1, 0.1], + ] # yaw + + root_vel_range = [ + [-0.75, 2.75], # x + [-0.55, 0.55], # y + [-2.5, 0.25], # z + [-0.35, 0.35], # roll + [-0.35, 0.35], # pitch + [-0.35, 0.35], # yaw + ] + + class control(LeggedRobotCfg.control): + # * PD Drive parameters: + stiffness = { + "hip_yaw": 30.0, + "hip_abad": 30.0, + "hip_pitch": 30.0, + "knee": 30.0, + "ankle": 30.0, + "shoulder_pitch": 40.0, + "shoulder_abad": 40.0, + "shoulder_yaw": 40.0, + "elbow": 50.0, + } # [N*m/rad] + damping = { + "hip_yaw": 2.0, + "hip_abad": 2.0, + "hip_pitch": 2.0, + "knee": 2.0, + "ankle": 2.0, + "shoulder_pitch": 2.0, + "shoulder_abad": 2.0, + "shoulder_yaw": 2.0, + "elbow": 1.0, + } # [N*m*s/rad] + + ctrl_frequency = 500 + desired_sim_frequency = 500 + + # class oscillator: + # base_frequency = 3.0 # [Hz] + + class commands: + resampling_time = 10.0 # time before command are changed[s] + + class ranges: + lin_vel_x = [-2.0, 2.0] # min max [m/s] [-0.75, 0.75] + lin_vel_y = 0.3 # max [m/s] + yaw_vel = 1.0 # max [rad/s] + + class push_robots: + toggle = True + interval_s = 2 + max_push_vel_xy = 0.5 + push_box_dims = [0.1, 0.1, 0.3] # x,y,z [m] + + class domain_rand: + randomize_friction = True + friction_range = [0.5, 1.25] + randomize_base_mass = True + added_mass_range = [-1.0, 1.0] + + class asset(LeggedRobotCfg.asset): + file = ( + "{LEGGED_GYM_ROOT_DIR}/resources/robots/" + + "mit_humanoid/urdf/humanoid_F_sf_learnt.urdf" + ) + # foot_collisionbox_names = ["foot"] + foot_name = "foot" + penalize_contacts_on = ["arm", "hand", "shoulder"] + terminate_after_contacts_on = ["base"] + flip_visual_attachments = False + self_collisions = 0 # 1 to disagble, 0 to enable...bitwise filter + collapse_fixed_joints = False + fix_base_link = False + disable_gravity = False + disable_motors = False + total_mass = 25.0 + + class reward_settings(LeggedRobotCfg.reward_settings): + soft_dof_pos_limit = 0.8 + soft_dof_vel_limit = 0.8 + soft_torque_limit = 0.8 + max_contact_force = 1500.0 + base_height_target = BASE_HEIGHT_REF + tracking_sigma = 0.25 + + # a smooth switch based on |cmd| (commanded velocity). 
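+ # * _switch() evaluates exp(-max(0, |cmd| - switch_scale)^2 / switch_scale): it stays
+ # * near 1 ("stand") while |cmd| <= switch_scale and decays towards 0 ("move") above.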
+ switch_scale = 0.5 + switch_threshold = 0.2 + + class scaling(LeggedRobotCfg.scaling): + base_ang_vel = 2.5 + base_lin_vel = 1.5 + commands = 1 + base_height = BASE_HEIGHT_REF + dof_pos = [ + 0.1, + 0.2, + 0.8, + 0.8, + 0.8, + 0.1, + 0.2, + 0.8, + 0.8, + 0.8, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + ] + # # * Action scales + dof_pos_target = dof_pos + dof_vel = [ + 0.5, + 1.0, + 4.0, + 4.0, + 2.0, + 0.5, + 1.0, + 4.0, + 4.0, + 2.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + ] + dof_pos_history = 3 * dof_pos + + +class LanderRunnerCfg(LeggedRobotRunnerCfg): + seed = -1 + runner_class_name = "MyRunner" + + class actor(LeggedRobotRunnerCfg.actor): + frequency = 100 + init_noise_std = 1.0 + hidden_dims = [512, 256, 128] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = ["elu", "elu", "tanh"] + layer_norm = True + smooth_exploration = False + + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "dof_pos_history", + "sampled_history_dof_pos", + "sampled_history_dof_vel", + "sampled_history_dof_pos_target", + ] + normalize_obs = False + + actions = ["dof_pos_target"] + disable_actions = False + + class noise: + dof_pos = 0.005 + dof_vel = 0.05 + base_ang_vel = 0.025 + base_lin_vel = 0.025 + projected_gravity = 0.01 + feet_contact_state = 0.025 + + class critic(LeggedRobotRunnerCfg.critic): + hidden_dims = [512, 256, 128] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + layer_norm = True + + obs = [ + # "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "dof_pos_history", + "sampled_history_dof_pos", + "sampled_history_dof_vel", + "sampled_history_dof_pos_target", + ] + normalize_obs = False + + class reward: + class weights: + torques = 5.0e-5 + power = 1e-6 # 1.0e-2 + min_base_height = 1.5 + lin_vel_xy = 1.0 + action_rate = 1e-2 + action_rate2 = 1e-3 + lin_vel_z = 0.0 + ang_vel_xy = 0.0 + dof_vel = 0.5 + dof_pos_limits = 0.25 + dof_near_home = 0.75 + hips_forward = 0.0 + collision = 1.0 + + class termination_weight: + termination = 1.0 + + class algorithm(LeggedRobotRunnerCfg.algorithm): + # both + gamma = 0.99 + lam = 0.95 + # shared + batch_size = 2**15 + max_gradient_steps = 24 + # new + storage_size = 2**17 # new + batch_size = 2**15 # new + + clip_param = 0.2 + learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor + entropy_coef = 0.01 + schedule = "adaptive" # could be adaptive, fixed + desired_kl = 0.01 + lr_range = [1e-5, 1e-2] + lr_ratio = 1.5 + + class runner(LeggedRobotRunnerCfg.runner): + policy_class_name = "ActorCritic" + algorithm_class_name = "PPO2" + num_steps_per_env = 24 + max_iterations = 500 + run_name = "lander" + experiment_name = "Humanoid" + save_interval = 50 diff --git a/gym/envs/mit_humanoid/mit_humanoid.py b/gym/envs/mit_humanoid/mit_humanoid.py index 74e35d5a..a2e082e8 100644 --- a/gym/envs/mit_humanoid/mit_humanoid.py +++ b/gym/envs/mit_humanoid/mit_humanoid.py @@ -1,7 +1,6 @@ import torch from gym.envs.base.legged_robot import LeggedRobot -from .jacobian import _apply_coupling class MIT_Humanoid(LeggedRobot): @@ -10,25 +9,148 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): def _init_buffers(self): super()._init_buffers() + self._init_sampled_history_buffers() + def _init_sampled_history_buffers(self): + self.sampled_history_dof_pos_target = torch.zeros( + 
(self.num_envs, self.num_dof * self.cfg.env.sampled_history_length), + device=self.device, + ) + self.sampled_history_dof_pos = torch.zeros( + self.num_envs, + self.num_dof * self.cfg.env.sampled_history_length, + device=self.device, + ) + self.sampled_history_dof_vel = torch.zeros( + self.num_envs, + self.num_dof * self.cfg.env.sampled_history_length, + device=self.device, + ) + self.sampled_history_counter = torch.zeros( + self.num_envs, dtype=int, device=self.device + ) + self.sampled_history_threshold = int( + self.cfg.control.ctrl_frequency / self.cfg.env.sampled_history_frequency + ) + self.J = torch.eye(self.num_dof).to(self.device) + self.J[4, 3] = 1 + self.J[9, 8] = 1 + self.J_inv_T = torch.inverse(self.J.T) + + def _apply_coupling(self, q, qd, q_des, qd_des, kp, kd, tau_ff): + # Create a Jacobian matrix and move it to the same device as input tensors + + # Perform transformations using Jacobian + q = torch.matmul(q, self.J.T) + qd = torch.matmul(qd, self.J.T) + q_des = torch.matmul(q_des, self.J.T) + qd_des = torch.matmul(qd_des, self.J.T) + + # Compute feed-forward torques + tau_ff = torch.matmul(self.J_inv_T, tau_ff.T).T + + # Compute kp and kd terms + kp = torch.diagonal( + torch.matmul( + torch.matmul(self.J_inv_T, torch.diag_embed(kp, dim1=-2, dim2=-1)), + self.J_inv_T.T, + ), + dim1=-2, + dim2=-1, + ) + + kd = torch.diagonal( + torch.matmul( + torch.matmul(self.J_inv_T, torch.diag_embed(kd, dim1=-2, dim2=-1)), + self.J_inv_T.T, + ), + dim1=-2, + dim2=-1, + ) + + # Compute torques + torques = kp * (q_des - q) + kd * (qd_des - qd) + tau_ff + torques = torch.matmul(torques, self.J) + + return torques + + def _reset_system(self, env_ids): + if len(env_ids) == 0: + return + super()._reset_system(env_ids) + self._reset_sampled_history_buffers(env_ids) + return + + def _reset_sampled_history_buffers(self, ids): + n = self.cfg.env.sampled_history_length + self.sampled_history_dof_pos_target[ids] = self.dof_pos_target[ids].tile(n) + self.sampled_history_dof_pos[ids] = self.dof_pos_target[ids].tile(n) + self.sampled_history_dof_vel[ids] = self.dof_pos_target[ids].tile(n) + + # compute_torques accounting for coupling, and filtering torques def _compute_torques(self): - self.desired_pos_target = self.dof_pos_target + self.default_dof_pos - q = self.dof_pos.clone() - qd = self.dof_vel.clone() - q_des = self.desired_pos_target.clone() - qd_des = self.dof_vel_target.clone() - tau_ff = self.tau_ff.clone() - kp = self.p_gains.clone() - kd = self.d_gains.clone() - - if self.cfg.asset.apply_humanoid_jacobian: - torques = _apply_coupling(q, qd, q_des, qd_des, kp, kd, tau_ff) - else: - torques = kp * (q_des - q) + kd * (qd_des - qd) + tau_ff + torques = self._apply_coupling( + self.dof_pos, + self.dof_vel, + self.dof_pos_target + self.default_dof_pos, + self.dof_vel_target, + self.p_gains, + self.d_gains, + self.tau_ff, + ) + return torques.clip(-self.torque_limits, self.torque_limits) + + def _post_decimation_step(self): + super()._post_decimation_step() + self._update_sampled_history_buffers() + + def _update_sampled_history_buffers(self): + self.sampled_history_counter += 1 + + ids = torch.nonzero( + self.sampled_history_counter == self.sampled_history_threshold, + as_tuple=False, + ).flatten() + + self.sampled_history_dof_pos_target[ids] = torch.roll( + self.sampled_history_dof_pos_target[ids], self.num_dof, dims=1 + ) # check + self.sampled_history_dof_pos_target[ids, : self.num_dof] = self.dof_pos_target[ + ids + ] + self.sampled_history_dof_pos[ids] = torch.roll( + 
self.sampled_history_dof_pos[ids], self.num_dof, dims=1 + ) # check + self.sampled_history_dof_pos[ids, : self.num_dof] = self.dof_pos_target[ids] + self.sampled_history_dof_vel[ids] = torch.roll( + self.sampled_history_dof_vel[ids], self.num_dof, dims=1 + ) # check + self.sampled_history_dof_vel[ids, : self.num_dof] = self.dof_pos_target[ids] - torques = torch.clip(torques, -self.torque_limits, self.torque_limits) + self.sampled_history_counter[ids] = 0 - return torques.view(self.torques.shape) + # --- rewards --- + + def _switch(self, mode=None): + c_vel = torch.linalg.norm(self.commands, dim=1) + switch = torch.exp( + -torch.square( + torch.max( + torch.zeros_like(c_vel), + c_vel - self.cfg.reward_settings.switch_scale, + ) + ) + / self.cfg.reward_settings.switch_scale + ) + if mode is None or mode == "stand": + return switch + elif mode == "move": + return 1 - switch + + def _reward_lin_vel_xy(self): + return torch.exp( + -torch.linalg.norm(self.commands[:, :2] - self.base_lin_vel[:, :2], dim=1) + ) def _reward_lin_vel_z(self): # Penalize z axis base linear velocity w. squared exp @@ -56,12 +178,44 @@ def _reward_tracking_lin_vel(self): def _reward_dof_vel(self): # Penalize dof velocities - return torch.mean(self._sqrdexp(self.dof_vel / self.scales["dof_vel"]), dim=1) + return torch.mean( + self._sqrdexp(self.dof_vel / self.scales["dof_vel"]), dim=1 + ) * self._switch("stand") def _reward_dof_near_home(self): - return torch.mean( - self._sqrdexp( - (self.dof_pos - self.default_dof_pos) / self.scales["dof_pos_obs"] - ), - dim=1, + return self._sqrdexp( + (self.dof_pos - self.default_dof_pos) / self.scales["dof_pos"] + ).mean(dim=1) + + def _reward_stand_still(self): + """Penalize motion at zero commands""" + # * normalize angles so we care about being within 5 deg + rew_pos = torch.mean( + self._sqrdexp((self.dof_pos - self.default_dof_pos) / torch.pi * 36), dim=1 + ) + rew_vel = torch.mean(self._sqrdexp(self.dof_vel), dim=1) + rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1) + rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1) + return rew_vel + rew_pos - rew_base_vel * self._switch("stand") + + def _compute_grf(self, grf_norm=True): + grf = torch.norm(self.contact_forces[:, self.feet_indices, :], dim=-1) + if grf_norm: + return torch.clamp_max(grf / self.cfg.asset.total_mass, 1.0) + else: + return grf + + def smooth_sqr_wave(self, phase, sigma=0.2): # sigma=0 is step function + return phase.sin() / (2 * torch.sqrt(phase.sin() ** 2.0 + sigma**2.0)) + 0.5 + + def _reward_hips_forward(self): + # reward hip motors for pointing forward + hip_yaw_abad = torch.cat((self.dof_pos[:, 0:2], self.dof_pos[:, 5:7]), dim=1) + hip_yaw_abad -= torch.cat( + (self.default_dof_pos[:, 0:2], self.default_dof_pos[:, 5:7]), dim=1 + ) + hip_yaw_abad /= torch.cat( + (self.scales["dof_pos"][0:2], self.scales["dof_pos"][5:7]) ) + return (hip_yaw_abad).pow(2).mean(dim=1) + # return self._sqrdexp(hip_yaw_abad).sum(dim=1).mean(dim=1) diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 3439e58c..861f2826 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -3,14 +3,17 @@ LeggedRobotRunnerCfg, ) +BASE_HEIGHT_REF = 0.80 + class MITHumanoidCfg(LeggedRobotCfg): class env(LeggedRobotCfg.env): num_envs = 4096 - num_observations = 49 + 3 * 18 # 121 num_actuators = 18 - episode_length_s = 100 # episode length in seconds - num_privileged_obs = num_observations + 
episode_length_s = 5 # episode length in seconds + + sampled_history_length = 3 # n samples + sampled_history_frequency = 10 # [Hz] class terrain(LeggedRobotCfg.terrain): pass @@ -25,17 +28,17 @@ class init_state(LeggedRobotCfg.init_state): default_joint_angles = { "hip_yaw": 0.0, "hip_abad": 0.0, - "hip_pitch": -0.4, - "knee": 0.9, - "ankle": -0.45, + "hip_pitch": -0.667751, + "knee": 1.4087, + "ankle": -0.708876, "shoulder_pitch": 0.0, "shoulder_abad": 0.0, "shoulder_yaw": 0.0, - "elbow": 0.0, + "elbow": -1.25, } # * default COM for basic initialization - pos = [0.0, 0.0, 0.66] # x,y,z [m] + pos = [0.0, 0.0, 0.6] # x,y,z [m] rot = [0.0, 0.0, 0.0, 1.0] # x,y,z,w [quat] lin_vel = [0.0, 0.0, 0.0] # x,y,z [m/s] ang_vel = [0.0, 0.0, 0.0] # x,y,z [rad/s] @@ -43,22 +46,22 @@ class init_state(LeggedRobotCfg.init_state): # * initialization for random range setup dof_pos_range = { - "hip_yaw": [0.0, 0.0], - "hip_abad": [0.0, 0.0], - "hip_pitch": [-0.29, -0.25], - "knee": [0.67, 0.71], - "ankle": [-0.43, -0.39], + "hip_yaw": [-0.0, 0.0], + "hip_abad": [-0.0, 0.0], + "hip_pitch": [-0.667751, -0.667751], + "knee": [1.4087, 1.4087], + "ankle": [-0.708876, -0.708876], "shoulder_pitch": [0.0, 0.0], "shoulder_abad": [0.0, 0.0], "shoulder_yaw": [0.0, 0.0], - "elbow": [0.0, 0.0], + "elbow": [0, 0], } dof_vel_range = { - "hip_yaw": [-0.0, 0.1], - "hip_abad": [-0.0, 0.1], - "hip_pitch": [-0.1, -0.1], - "knee": [-0.05, 0.05], - "ankle": [-0.05, 0.05], + "hip_yaw": [-0.1, 0.1], + "hip_abad": [-0.1, 0.1], + "hip_pitch": [-0.1, 0.1], + "knee": [-0.1, 0.1], + "ankle": [-0.1, 0.1], "shoulder_pitch": [0.0, 0.0], "shoulder_abad": [0.0, 0.0], "shoulder_yaw": [0.0, 0.0], @@ -68,20 +71,20 @@ class init_state(LeggedRobotCfg.init_state): root_pos_range = [ [0.0, 0.0], # x [0.0, 0.0], # y - [0.7, 0.72], # z + [0.64, 0.7], # z [-0.1, 0.1], # roll [-0.1, 0.1], # pitch [-0.1, 0.1], ] # yaw root_vel_range = [ - [-0.1, 0.1], # x - [-0.1, 0.1], # y - [-0.1, 0.1], # z - [-0.1, 0.1], # roll - [-0.1, 0.1], # pitch - [-0.1, 0.1], - ] # yaw + [-0.75, 2.75], # x + [-0.55, 0.55], # y + [-0.35, 0.1], # z + [-0.35, 0.35], # roll + [-0.35, 0.35], # pitch + [-0.35, 0.35], # yaw + ] class control(LeggedRobotCfg.control): # * PD Drive parameters: @@ -94,39 +97,44 @@ class control(LeggedRobotCfg.control): "shoulder_pitch": 40.0, "shoulder_abad": 40.0, "shoulder_yaw": 40.0, - "elbow": 40.0, + "elbow": 50.0, } # [N*m/rad] damping = { - "hip_yaw": 5.0, - "hip_abad": 5.0, - "hip_pitch": 5.0, - "knee": 5.0, - "ankle": 5.0, - "shoulder_pitch": 5.0, - "shoulder_abad": 5.0, - "shoulder_yaw": 5.0, - "elbow": 5.0, + "hip_yaw": 2.0, + "hip_abad": 2.0, + "hip_pitch": 2.0, + "knee": 2.0, + "ankle": 2.0, + "shoulder_pitch": 2.0, + "shoulder_abad": 2.0, + "shoulder_yaw": 2.0, + "elbow": 1.0, } # [N*m*s/rad] ctrl_frequency = 100 - desired_sim_frequency = 800 + desired_sim_frequency = 500 + + filter_gain = 0.1586 # 1: no filtering, 0: wall + + class oscillator: + base_frequency = 3.0 # [Hz] class commands: resampling_time = 10.0 # time before command are changed[s] class ranges: - lin_vel_x = [-1.0, 1.0] # min max [m/s] - lin_vel_y = 1.0 # max [m/s] - yaw_vel = 1 # max [rad/s] + lin_vel_x = [-0.0, 4.0] # min max [m/s] [-0.75, 0.75] + lin_vel_y = 0.0 # max [m/s] + yaw_vel = 0.0 # max [rad/s] class push_robots: - toggle = False - interval_s = 15 - max_push_vel_xy = 0.05 - push_box_dims = [0.1, 0.2, 0.3] # x,y,z [m] + toggle = True + interval_s = 1 + max_push_vel_xy = 0.5 + push_box_dims = [0.1, 0.1, 0.3] # x,y,z [m] class domain_rand: - randomize_friction = 
False + randomize_friction = True friction_range = [0.5, 1.25] randomize_base_mass = True added_mass_range = [-1.0, 1.0] @@ -134,61 +142,100 @@ class domain_rand: class asset(LeggedRobotCfg.asset): file = ( "{LEGGED_GYM_ROOT_DIR}/resources/robots/" - + "mit_humanoid/urdf/humanoid_R_sf.urdf" + + "mit_humanoid/urdf/humanoid_F_sf_learnt.urdf" ) + # foot_collisionbox_names = ["foot"] foot_name = "foot" - penalize_contacts_on = ["base", "arm"] + penalize_contacts_on = ["arm"] terminate_after_contacts_on = ["base"] - end_effector_names = ["hand", "foot"] + end_effector_names = ["hand", "foot"] # ?? flip_visual_attachments = False - self_collisions = 1 # 1 to disagble, 0 to enable...bitwise filter + self_collisions = 0 # 1 to disagble, 0 to enable...bitwise filter collapse_fixed_joints = False # * see GymDofDriveModeFlags # * (0 is none, 1 is pos tgt, 2 is vel tgt, 3 effort) default_dof_drive_mode = 3 + fix_base_link = False disable_gravity = False disable_motors = False - apply_humanoid_jacobian = False + total_mass = 25.0 class reward_settings(LeggedRobotCfg.reward_settings): - soft_dof_pos_limit = 0.9 - soft_dof_vel_limit = 0.9 - soft_torque_limit = 0.9 + soft_dof_pos_limit = 0.8 + soft_dof_vel_limit = 0.8 + soft_torque_limit = 0.8 max_contact_force = 1500.0 + base_height_target = BASE_HEIGHT_REF + tracking_sigma = 0.25 - base_height_target = 0.65 - tracking_sigma = 0.5 + # a smooth switch based on |cmd| (commanded velocity). + switch_scale = 0.5 + switch_threshold = 0.2 class scaling(LeggedRobotCfg.scaling): - # * dimensionless time: sqrt(L/g) or sqrt(I/[mgL]), with I=I0+mL^2 - virtual_leg_length = 0.65 - dimensionless_time = (virtual_leg_length / 9.81) ** 0.5 - base_height = virtual_leg_length - base_lin_vel = virtual_leg_length / dimensionless_time - base_ang_vel = 3.14 / dimensionless_time - dof_vel = 20 # ought to be roughly max expected speed. 
- height_measurements = virtual_leg_length - - # todo check order of joints, create per-joint scaling - dof_pos = 3.14 - dof_pos_obs = dof_pos - # * Action scales + base_ang_vel = 2.5 + base_lin_vel = 1.5 + commands = 1 + base_height = BASE_HEIGHT_REF + dof_pos = [ + 0.1, + 0.2, + 0.8, + 0.8, + 0.8, + 0.1, + 0.2, + 0.8, + 0.8, + 0.8, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + ] + # # * Action scales dof_pos_target = dof_pos - tau_ff = 0.1 + dof_vel = [ + 0.5, + 1.0, + 4.0, + 4.0, + 2.0, + 0.5, + 1.0, + 4.0, + 4.0, + 2.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + ] + dof_pos_history = 3 * dof_pos class MITHumanoidRunnerCfg(LeggedRobotRunnerCfg): seed = -1 runner_class_name = "OnPolicyRunner" - class policy(LeggedRobotRunnerCfg.policy): + class actor(LeggedRobotRunnerCfg.actor): + frequency = 100 init_noise_std = 1.0 - actor_hidden_dims = [512, 256, 128] - critic_hidden_dims = [512, 256, 128] + hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + layer_norm = [True, True, False] + smooth_exploration = False - actor_obs = [ + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -196,59 +243,86 @@ class policy(LeggedRobotRunnerCfg.policy): "commands", "dof_pos_obs", "dof_vel", - "dof_pos_history", + # "dof_pos_history", + "sampled_history_dof_pos", + "sampled_history_dof_vel", + "sampled_history_dof_pos_target", ] - critic_obs = actor_obs + normalize_obs = False actions = ["dof_pos_target"] + disable_actions = False class noise: - base_height = 0.05 - dof_pos_obs = 0.0 - dof_vel = 0.0 - base_lin_vel = 0.1 - base_ang_vel = 0.2 - projected_gravity = 0.05 - height_measurements = 0.1 + dof_pos = 0.005 + dof_vel = 0.05 + base_ang_vel = 0.025 + base_lin_vel = 0.025 + projected_gravity = 0.01 + feet_contact_state = 0.025 + + class critic(LeggedRobotRunnerCfg.critic): + hidden_dims = [512, 256, 128] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + layer_norm = [True, True, False] + + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "dof_pos_obs", + "dof_vel", + "sampled_history_dof_pos", + "sampled_history_dof_vel", + "sampled_history_dof_pos_target", + ] + normalize_obs = False class reward: class weights: + tracking_lin_vel = 4.0 tracking_ang_vel = 0.5 - tracking_lin_vel = 0.5 - orientation = 1.5 - torques = 5.0e-6 + # orientation = 1.0 + torques = 5.0e-4 min_base_height = 1.5 - action_rate = 0.01 - action_rate2 = 0.001 + action_rate = 1e-3 + action_rate2 = 1e-3 lin_vel_z = 0.0 ang_vel_xy = 0.0 - dof_vel = 0.0 - stand_still = 0.0 - dof_pos_limits = 0.0 - dof_near_home = 0.5 + # dof_vel = 0.25 + # stand_still = 0.25 + dof_pos_limits = 0.25 + dof_near_home = 0.25 + hips_forward = 0.0 + walk_freq = 0.0 # 2.5 class termination_weight: termination = 15 class algorithm(LeggedRobotRunnerCfg.algorithm): - # * training params - value_loss_coef = 1.0 - use_clipped_value_loss = True + # both + gamma = 0.99 + lam = 0.95 + # shared + batch_size = 4096 + max_gradient_steps = 48 clip_param = 0.2 - entropy_coef = 0.001 - num_learning_epochs = 5 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 - learning_rate = 5.0e-4 + learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor + entropy_coef = 0.01 schedule = "adaptive" # could be adaptive, fixed - gamma = 0.999 - lam = 0.95 desired_kl = 0.01 - max_grad_norm = 1.0 + lr_range = [1e-5, 1e-2] + lr_ratio = 1.5 class 
runner(LeggedRobotRunnerCfg.runner): policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 24 max_iterations = 1000 run_name = "Standing" diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py new file mode 100644 index 00000000..90bc60ae --- /dev/null +++ b/gym/envs/pendulum/pendulum.py @@ -0,0 +1,91 @@ +from math import sqrt +import torch +import numpy as np + +from gym.envs.base.fixed_robot import FixedRobot + + +class Pendulum(FixedRobot): + def _init_buffers(self): + super()._init_buffers() + self.dof_pos_obs = torch.zeros(self.num_envs, 2, device=self.device) + + def _post_decimation_step(self): + super()._post_decimation_step() + self.dof_pos_obs = torch.cat([self.dof_pos.sin(), self.dof_pos.cos()], dim=1) + + def _reset_system(self, env_ids): + super()._reset_system(env_ids) + self.dof_pos_obs[env_ids] = torch.cat( + [self.dof_pos[env_ids].sin(), self.dof_pos[env_ids].cos()], dim=1 + ) + + def _check_terminations_and_timeouts(self): + super()._check_terminations_and_timeouts() + self.terminated = self.timed_out + + def reset_to_uniform(self, env_ids): + grid_points = int(sqrt(self.num_envs)) + lin_pos = torch.linspace( + self.dof_pos_range[0, 0], + self.dof_pos_range[0, 1], + grid_points, + device=self.device, + ) + lin_vel = torch.linspace( + self.dof_vel_range[0, 0], + self.dof_vel_range[0, 1], + grid_points, + device=self.device, + ) + grid = torch.cartesian_prod(lin_pos, lin_vel) + self.dof_pos[env_ids] = grid[env_ids, 0].unsqueeze(-1) + self.dof_vel[env_ids] = grid[env_ids, 1].unsqueeze(-1) + + def _reward_theta(self): + theta_rwd = torch.cos(self.dof_pos[:, 0]) # no scaling + return self._sqrdexp(theta_rwd) + + def _reward_omega(self): + omega_rwd = torch.square(self.dof_vel[:, 0] / self.scales["dof_vel"]) + return self._sqrdexp(omega_rwd) + + def _reward_equilibrium(self): + # theta_norm = self._normalize_theta() + # omega = self.dof_vel[:, 0] + # error = torch.stack( + # [theta_norm / self.scales["dof_pos"], omega / self.scales["dof_vel"]], + # dim=1) + # return self._sqrdexp(torch.mean(error, dim=1), 0.01) + + # todo compare alternatives + error = torch.abs(self.dof_state) + error[:, 0] /= self.scales["dof_pos"] + error[:, 1] /= self.scales["dof_vel"] + return self._sqrdexp(torch.mean(error, dim=1), sigma=0.01) + + def _reward_torques(self): + """Penalize torques""" + return self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), sigma=0.2) + + def _reward_energy(self): + kinetic_energy = ( + 0.5 + * self.cfg.asset.mass + * self.cfg.asset.length**2 + * torch.square(self.dof_vel[:, 0]) + ) + potential_energy = ( + self.cfg.asset.mass + * 9.81 + * self.cfg.asset.length + * torch.cos(self.dof_pos[:, 0]) + ) + desired_energy = self.cfg.asset.mass * 9.81 * self.cfg.asset.length + energy_error = kinetic_energy + potential_energy - desired_energy + return self._sqrdexp(energy_error / desired_energy) + + def _normalize_theta(self): + # normalize to range [-pi, pi] + theta = self.dof_pos[:, 0] + return ((theta + np.pi) % (2 * np.pi)) - np.pi diff --git a/gym/envs/pendulum/pendulum_PSD_config.py b/gym/envs/pendulum/pendulum_PSD_config.py new file mode 100644 index 00000000..083588c8 --- /dev/null +++ b/gym/envs/pendulum/pendulum_PSD_config.py @@ -0,0 +1,62 @@ +from gym.envs.pendulum.pendulum_SAC_config import PendulumSACCfg, PendulumSACRunnerCfg + + +class PendulumPSDCfg(PendulumSACCfg): + class env(PendulumSACCfg.env): + pass + + class init_state(PendulumSACCfg.init_state): + pass + + class 
control(PendulumSACCfg.control): + pass + + class asset(PendulumSACCfg.asset): + pass + + class reward_settings(PendulumSACCfg.reward_settings): + pass + + class scaling(PendulumSACCfg.scaling): + pass + + +class PendulumPSDRunnerCfg(PendulumSACRunnerCfg): + seed = -1 + runner_class_name = "PSACRunner" + + class actor(PendulumSACRunnerCfg.actor): + pass + + class critic(PendulumSACRunnerCfg.critic): + obs = ["dof_pos_obs", "dof_vel"] + hidden_dims = [16, 16] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + dropouts = [0.1, 0.0] + layer_norm = [True, True] + activation = "elu" + # TODO[lm]: Current normalization uses torch.no_grad, this should be changed + normalize_obs = False + + # critic_class_name = "CholeskyLatent" + # critic_class_name = "Critic" + critic_class_name = "DenseSpectralLatent" + # * some class-specific params + minimize = False + relative_dim = 4 # 16 + latent_dim = 4 # 18, + latent_hidden_dims = [16] + latent_activation = "elu" + + class algorithm(PendulumSACRunnerCfg.algorithm): + alpha_lr = 1e-4 + actor_lr = 1e-4 + critic_lr = 1e-4 + + class runner(PendulumSACRunnerCfg.runner): + run_name = "" + experiment_name = "psd_pendulum" + algorithm_class_name = "SAC" + num_steps_per_env = 1 + save_interval = 500 + log_storage = True diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py new file mode 100644 index 00000000..42bf0265 --- /dev/null +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -0,0 +1,103 @@ +import torch +from gym.envs.base.fixed_robot_config import FixedRobotCfgPPO +from gym.envs.pendulum.pendulum_config import PendulumCfg + + +class PendulumSACCfg(PendulumCfg): + class env(PendulumCfg.env): + num_envs = 1024 + episode_length_s = 10 + + class init_state(PendulumCfg.init_state): + reset_mode = "reset_to_range" + default_joint_angles = {"theta": 0.0} + dof_pos_range = { + "theta": [-torch.pi, torch.pi], + } + dof_vel_range = {"theta": [-5, 5]} + + class control(PendulumCfg.control): + ctrl_frequency = 100 + desired_sim_frequency = 100 + + class asset(PendulumCfg.asset): + joint_damping = 0.1 + + class reward_settings(PendulumCfg.reward_settings): + tracking_sigma = 0.25 + + class scaling(PendulumCfg.scaling): + dof_vel = 5.0 + dof_pos = 2.0 * torch.pi + tau_ff = 1.0 + + +class PendulumSACRunnerCfg(FixedRobotCfgPPO): + seed = -1 + runner_class_name = "OffPolicyRunner" + + class actor(FixedRobotCfgPPO.actor): + frequency = 10 + latent_nn = {"hidden_dims": [128, 64], "activation": "elu", "layer_norm": True} + mean_nn = {"hidden_dims": [32], "activation": "elu", "layer_norm": True} + std_nn = {"hidden_dims": [32], "activation": "elu", "layer_norm": True} + nn_params = {"latent": latent_nn, "mean": mean_nn, "std": std_nn} + + normalize_obs = False + obs = ["dof_pos_obs", "dof_vel"] + actions = ["tau_ff"] + disable_actions = False + + class noise: + dof_pos = 0.0 + dof_vel = 0.0 + + class critic(FixedRobotCfgPPO.critic): + obs = [ + "dof_pos_obs", + "dof_vel", + ] + hidden_dims = [128, 64, 32] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + # TODO[lm]: Current normalization uses torch.no_grad, this should be changed + normalize_obs = False + + class reward: + class weights: + theta = 0.0 + omega = 0.0 + equilibrium = 2.0 + energy = 0.5 + dof_vel = 0.0 + torques = 0.025 + + class termination_weight: + termination = 0.0 + + class algorithm(FixedRobotCfgPPO.algorithm): + initial_fill = 0 + storage_size = 100 * 1024 # 10**6 # 17 + batch_size = 256 # 4096 + max_gradient_steps = 1 # 10 # 
SB3: 1 + action_max = 2.0 + action_min = -2.0 + actor_noise_std = 1.0 + log_std_max = 4.0 + log_std_min = -20.0 + alpha = 0.2 + target_entropy = -1.0 + max_grad_norm = 1.0 + polyak = 0.995 # flipped compared to stable-baselines3 (polyak == 1-tau) + gamma = 0.99 + alpha_lr = 1e-4 + actor_lr = 1e-3 + critic_lr = 1e-3 + + class runner(FixedRobotCfgPPO.runner): + run_name = "" + experiment_name = "sac_pendulum" + max_iterations = 30_000 # number of policy updates + algorithm_class_name = "SAC" + save_interval = 5000 + num_steps_per_env = 1 diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py new file mode 100644 index 00000000..910fe341 --- /dev/null +++ b/gym/envs/pendulum/pendulum_config.py @@ -0,0 +1,133 @@ +import torch + +from gym.envs.base.fixed_robot_config import FixedRobotCfg, FixedRobotCfgPPO + + +class PendulumCfg(FixedRobotCfg): + class env(FixedRobotCfg.env): + num_envs = 4096 + num_actuators = 1 + episode_length_s = 10 + + class terrain(FixedRobotCfg.terrain): + pass + + class viewer: + ref_env = 0 + pos = [10.0, 5.0, 10.0] # [m] + lookat = [0.0, 0.0, 0.0] # [m] + + class init_state(FixedRobotCfg.init_state): + default_joint_angles = {"theta": 0} # -torch.pi / 2.0} + + # * default setup chooses how the initial conditions are chosen. + # * "reset_to_basic" = a single position + # * "reset_to_range" = uniformly random from a range defined below + reset_mode = "reset_to_range" + + # * initial conditions for reset_to_range + dof_pos_range = { + "theta": [-torch.pi, torch.pi], + } + dof_vel_range = {"theta": [-5, 5]} + + class control(FixedRobotCfg.control): + actuated_joints_mask = [1] # angle + ctrl_frequency = 25 + desired_sim_frequency = 200 + stiffness = {"theta": 0.0} # [N*m/rad] + damping = {"theta": 0.0} # [N*m*s/rad] + + class asset(FixedRobotCfg.asset): + # * Things that differ + file = "{LEGGED_GYM_ROOT_DIR}/resources/robots/" + "pendulum/urdf/pendulum.urdf" + disable_gravity = False + disable_motors = False # all torques set to 0 + joint_damping = 0.1 + mass = 1.0 + length = 1.0 + + class reward_settings(FixedRobotCfg.reward_settings): + tracking_sigma = 0.25 + + class scaling(FixedRobotCfg.scaling): + dof_vel = 5.0 + dof_pos = 2.0 * torch.pi + # * Action scales + tau_ff = 1.0 + + +class PendulumRunnerCfg(FixedRobotCfgPPO): + seed = -1 + runner_class_name = "OnPolicyRunner" + + class actor(FixedRobotCfgPPO.actor): + frequency = 25 + hidden_dims = [128, 64, 32] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "tanh" + + # TODO[lm]: Handle normalization in SAC, then also use it here again + normalize_obs = False + obs = [ + "dof_pos_obs", + "dof_vel", + ] + + actions = ["tau_ff"] + disable_actions = False + + class noise: + dof_pos = 0.0 + dof_vel = 0.0 + + class critic: + hidden_dims = [128, 64, 32] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "tanh" + + # TODO[lm]: Handle normalization in SAC, then also use it here again + normalize_obs = False + obs = [ + "dof_pos_obs", + "dof_vel", + ] + + class reward: + class weights: + theta = 0.1 + omega = 0.1 + equilibrium = 1.0 + energy = 0.5 + dof_vel = 0.1 + torques = 0.025 + + class termination_weight: + termination = 0.0 + + class algorithm(FixedRobotCfgPPO.algorithm): + # both + gamma = 0.95 + # discount_horizon = 2.0 + lam = 0.98 + # shared + max_gradient_steps = 24 + # new + storage_size = 2**17 # new + batch_size = 2**16 # new + clip_param = 0.2 + learning_rate = 1.0e-4 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # 
Actor + entropy_coef = 0.01 + schedule = "fixed" # could be adaptive, fixed + desired_kl = 0.01 + + class runner(FixedRobotCfgPPO.runner): + run_name = "" + experiment_name = "pendulum" + max_iterations = 200 # number of policy updates + algorithm_class_name = "PPO2" + num_steps_per_env = 32 diff --git a/gym/utils/helpers.py b/gym/utils/helpers.py index 90af8bce..8e74d93c 100644 --- a/gym/utils/helpers.py +++ b/gym/utils/helpers.py @@ -191,7 +191,7 @@ def get_args(custom_parameters=None): { "name": "--task", "type": str, - "default": "cartpole", + "default": "lander", "help": "Resume training or start testing from a checkpoint. " "Overrides config file if provided.", }, diff --git a/gym/utils/logging_and_saving/wandb_singleton.py b/gym/utils/logging_and_saving/wandb_singleton.py index a1b45b8a..bba14fea 100644 --- a/gym/utils/logging_and_saving/wandb_singleton.py +++ b/gym/utils/logging_and_saving/wandb_singleton.py @@ -7,7 +7,7 @@ class WandbSingleton(object): def __new__(self): if not hasattr(self, "instance"): - self.instance = super(WandbSingleton, self).__new__(self) + self.instance = super().__new__(self) self.entity_name = None self.project_name = None self.experiment_name = "" diff --git a/gym/utils/task_registry.py b/gym/utils/task_registry.py index 6c2025ba..cc0787ca 100644 --- a/gym/utils/task_registry.py +++ b/gym/utils/task_registry.py @@ -178,6 +178,9 @@ def set_control_and_sim_dt(self, env_cfg, train_cfg): f" to {env_cfg.sim_dt}." ) + if not hasattr(train_cfg.actor, "frequency"): + train_cfg.actor.frequency = env_cfg.control.ctrl_frequency + def set_discount_rates(self, train_cfg, dt): if hasattr(train_cfg.algorithm, "discount_horizon"): hrzn = train_cfg.algorithm.discount_horizon diff --git a/learning/__init__.py b/learning/__init__.py index 34f15bd6..15c19314 100644 --- a/learning/__init__.py +++ b/learning/__init__.py @@ -29,3 +29,9 @@ # THE POSSIBILITY OF SUCH DAMAGE. # # Copyright (c) 2021 ETH Zurich, Nikita Rudin + +import os + +LEGGED_GYM_ROOT_DIR = os.path.dirname( + os.path.dirname(os.path.realpath(__file__)) +) diff --git a/learning/algorithms/__init__.py b/learning/algorithms/__init__.py index 7827379b..7211a720 100644 --- a/learning/algorithms/__init__.py +++ b/learning/algorithms/__init__.py @@ -31,4 +31,6 @@ # Copyright (c) 2021 ETH Zurich, Nikita Rudin from .ppo import PPO +from .ppo2 import PPO2 from .SE import StateEstimator +from .sac import SAC diff --git a/learning/algorithms/ppo.py b/learning/algorithms/ppo.py index 9c7454bd..92898ad5 100644 --- a/learning/algorithms/ppo.py +++ b/learning/algorithms/ppo.py @@ -69,7 +69,12 @@ def __init__( self.actor_critic = actor_critic self.actor_critic.to(self.device) self.storage = None # initialized later - self.optimizer = optim.Adam(self.actor_critic.parameters(), lr=learning_rate) + self.optimizer = optim.Adam( + self.actor_critic.actor.parameters(), lr=learning_rate + ) + self.crit_optimizer = optim.Adam( + self.actor_critic.critic.parameters(), lr=learning_rate + ) self.transition = RolloutStorage.Transition() # * PPO parameters @@ -121,6 +126,11 @@ def act(self, obs, critic_obs): return self.transition.actions def process_env_step(self, rewards, dones, timed_out=None): + if self.storage is None: + raise AttributeError( + "This version of PPO is deprecated and only works with OldPolicyRunner." + "Use PPO2 instead." 
+ ) self.transition.rewards = rewards.clone() self.transition.dones = dones # * Bootstrapping on time outs @@ -156,7 +166,6 @@ def update(self): actions_log_prob_batch = self.actor_critic.get_actions_log_prob( actions_batch ) - value_batch = self.actor_critic.evaluate(critic_obs_batch) mu_batch = self.actor_critic.action_mean sigma_batch = self.actor_critic.action_std entropy_batch = self.actor_critic.entropy @@ -195,6 +204,7 @@ def update(self): surrogate_loss = torch.max(surrogate, surrogate_clipped).mean() # * Value function loss + value_batch = self.actor_critic.evaluate(critic_obs_batch) if self.use_clipped_value_loss: value_clipped = target_values_batch + ( value_batch - target_values_batch @@ -207,16 +217,25 @@ def update(self): loss = ( surrogate_loss - + self.value_loss_coef * value_loss + # + self.value_loss_coef * value_loss - self.entropy_coef * entropy_batch.mean() ) # * Gradient step self.optimizer.zero_grad() loss.backward() - nn.utils.clip_grad_norm_(self.actor_critic.parameters(), self.max_grad_norm) + nn.utils.clip_grad_norm_( + self.actor_critic.actor.parameters(), self.max_grad_norm + ) self.optimizer.step() + self.crit_optimizer.zero_grad() + value_loss.backward() + nn.utils.clip_grad_norm_( + self.actor_critic.critic.parameters(), self.max_grad_norm + ) + self.crit_optimizer.step() + self.mean_value_loss += value_loss.item() self.mean_surrogate_loss += surrogate_loss.item() diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py new file mode 100644 index 00000000..6c64e334 --- /dev/null +++ b/learning/algorithms/ppo2.py @@ -0,0 +1,165 @@ +import torch +import torch.nn as nn +import torch.optim as optim + +from learning.utils import ( + create_uniform_generator, + compute_generalized_advantages, + normalize, +) + + +class PPO2: + def __init__( + self, + actor, + critic, + batch_size=2**15, + max_gradient_steps=10, + clip_param=0.2, + gamma=0.998, + lam=0.95, + entropy_coef=0.0, + learning_rate=1e-3, + max_grad_norm=1.0, + use_clipped_value_loss=True, + schedule="fixed", + desired_kl=0.01, + device="cpu", + lr_range=[1e-4, 1e-2], + lr_ratio=1.3, + **kwargs, + ): + self.device = device + + self.desired_kl = desired_kl + self.schedule = schedule + self.learning_rate = learning_rate + self.lr_range = lr_range + self.lr_ratio = lr_ratio + + # * PPO components + self.actor = actor.to(self.device) + self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) + self.critic = critic.to(self.device) + self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=learning_rate) + + # * PPO parameters + self.clip_param = clip_param + self.batch_size = batch_size + self.max_gradient_steps = max_gradient_steps + self.entropy_coef = entropy_coef + self.gamma = gamma + self.lam = lam + self.max_grad_norm = max_grad_norm + self.use_clipped_value_loss = use_clipped_value_loss + + def switch_to_train(self): + self.actor.train() + self.critic.train() + + def act(self, obs): + return self.actor.act(obs).detach() + + def update(self, data): + data["values"] = self.critic.evaluate(data["critic_obs"]) + data["advantages"] = compute_generalized_advantages( + data, self.gamma, self.lam, self.critic + ) + data["returns"] = data["advantages"] + data["values"] + self.update_critic(data) + data["advantages"] = normalize(data["advantages"]) + self.update_actor(data) + + def update_critic(self, data): + self.mean_value_loss = 0 + counter = 0 + + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) + for batch 
in generator: + value_loss = self.critic.loss_fn(batch["critic_obs"], batch["returns"]) + self.critic_optimizer.zero_grad() + value_loss.backward() + nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm) + self.critic_optimizer.step() + self.mean_value_loss += value_loss.item() + counter += 1 + self.mean_value_loss /= counter + + def update_actor(self, data): + self.mean_surrogate_loss = 0 + counter = 0 + + self.actor.update_distribution(data["actor_obs"]) + data["old_sigma_batch"] = self.actor.action_std.detach() + data["old_mu_batch"] = self.actor.action_mean.detach() + data["old_actions_log_prob_batch"] = self.actor.get_actions_log_prob( + data["actions"] + ).detach() + + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) + for batch in generator: + self.actor.update_distribution(batch["actor_obs"]) + actions_log_prob_batch = self.actor.get_actions_log_prob(batch["actions"]) + mu_batch = self.actor.action_mean + sigma_batch = self.actor.action_std + entropy_batch = self.actor.entropy + + # * KL + if self.desired_kl is not None and self.schedule == "adaptive": + with torch.inference_mode(): + kl = torch.sum( + torch.log(sigma_batch / batch["old_sigma_batch"] + 1.0e-5) + + ( + torch.square(batch["old_sigma_batch"]) + + torch.square(batch["old_mu_batch"] - mu_batch) + ) + / (2.0 * torch.square(sigma_batch)) + - 0.5, + axis=-1, + ) + kl_mean = torch.mean(kl) + lr_min, lr_max = self.lr_range + + if kl_mean > self.desired_kl * 2.0: + self.learning_rate = max( + lr_min, self.learning_rate / self.lr_ratio + ) + elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: + self.learning_rate = min( + lr_max, self.learning_rate * self.lr_ratio + ) + + for param_group in self.optimizer.param_groups: + # ! 
check this + param_group["lr"] = self.learning_rate + + # * Surrogate loss + ratio = torch.exp( + actions_log_prob_batch + - torch.squeeze(batch["old_actions_log_prob_batch"]) + ) + surrogate = -torch.squeeze(batch["advantages"]) * ratio + surrogate_clipped = -torch.squeeze(batch["advantages"]) * torch.clamp( + ratio, 1.0 - self.clip_param, 1.0 + self.clip_param + ) + surrogate_loss = torch.max(surrogate, surrogate_clipped).mean() + + loss = surrogate_loss - self.entropy_coef * entropy_batch.mean() + + # * Gradient step + self.optimizer.zero_grad() + loss.backward() + nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) + self.optimizer.step() + self.mean_surrogate_loss += surrogate_loss.item() + counter += 1 + self.mean_surrogate_loss /= counter diff --git a/learning/algorithms/sac.py b/learning/algorithms/sac.py new file mode 100644 index 00000000..7bda74cb --- /dev/null +++ b/learning/algorithms/sac.py @@ -0,0 +1,268 @@ +import torch + +# import torch.nn as nn +import torch.optim as optim + +from learning.utils import create_uniform_generator, polyak_update + + +class SAC: + def __init__( + self, + actor, + critic_1, + critic_2, + target_critic_1, + target_critic_2, + batch_size=2**15, + max_gradient_steps=10, + action_max=1.0, + action_min=-1.0, + actor_noise_std=1.0, + log_std_max=4.0, + log_std_min=-20.0, + alpha=0.2, + alpha_lr=1e-4, + target_entropy=None, + max_grad_norm=1.0, + polyak=0.995, + gamma=0.99, + actor_lr=1e-4, + critic_lr=1e-4, + device="cpu", + **kwargs, + ): + self.device = device + + # * SAC components + self.actor = actor.to(self.device) + self.critic_1 = critic_1.to(self.device) + self.critic_2 = critic_2.to(self.device) + self.target_critic_1 = target_critic_1.to(self.device) + self.target_critic_2 = target_critic_2.to(self.device) + self.target_critic_1.load_state_dict(self.critic_1.state_dict()) + self.target_critic_2.load_state_dict(self.critic_2.state_dict()) + + self.log_alpha = torch.log(torch.tensor(alpha)).requires_grad_() + + self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr) + self.log_alpha_optimizer = optim.Adam([self.log_alpha], lr=alpha_lr) + self.critic_1_optimizer = optim.Adam(self.critic_1.parameters(), lr=critic_lr) + self.critic_2_optimizer = optim.Adam(self.critic_2.parameters(), lr=critic_lr) + + # this is probably something to put into the neural network + self.action_max = action_max + self.action_min = action_min + + self.action_delta = (self.action_max - self.action_min) / 2.0 + self.action_offset = (self.action_max + self.action_min) / 2.0 + + self.max_grad_norm = max_grad_norm + self.target_entropy = ( + target_entropy if target_entropy else -self.actor.num_actions + ) + + # * SAC parameters + self.max_gradient_steps = max_gradient_steps + self.batch_size = batch_size + self.polyak = polyak + self.gamma = gamma + # self.ent_coef = "fixed" + # self.target_entropy = "fixed" + + @property + def alpha(self): + return self.log_alpha.exp() + + @property + def alpha_lr(self): + return self.log_alpha_optimizer.param_groups[0]["lr"] + + @property + def critic_1_lr(self): + return self.critic_1_optimizer.param_groups[0]["lr"] + + @property + def critic_2_lr(self): + return self.critic_2_optimizer.param_groups[0]["lr"] + + @property + def actor_lr(self): + return self.actor_optimizer.param_groups[0]["lr"] + + def switch_to_train(self): + self.actor.train() + self.critic_1.train() + self.critic_2.train() + self.target_critic_1.train() + self.target_critic_2.train() + + def switch_to_eval(self): + 
self.actor.eval() + self.critic_1.eval() + self.critic_2.eval() + self.target_critic_1.eval() + self.target_critic_2.eval() + + def act(self, obs): + mean, std = self.actor.forward(obs, deterministic=False) + distribution = torch.distributions.Normal(mean, std) + actions = distribution.rsample() + actions_normalized = torch.tanh(actions) + # RSL also does a resahpe(-1, self.action_size), not sure why + actions_scaled = ( + actions_normalized * self.action_delta + self.action_offset + ).clamp(self.action_min, self.action_max) + return actions_scaled + + def act_inference(self, obs): + mean = self.actor.forward(obs, deterministic=True) + actions_normalized = torch.tanh(mean) + actions_scaled = ( + actions_normalized * self.action_delta + self.action_offset + ).clamp(self.action_min, self.action_max) + return actions_scaled + + def update(self, data): + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) + + count = 0 + self.mean_actor_loss = 0 + self.mean_alpha_loss = 0 + self.mean_critic_1_loss = 0 + self.mean_critic_2_loss = 0 + + for batch in generator: + self.update_critic(batch) + self.update_actor_and_alpha(batch) + + count += 1 + # Update Target Networks + self.target_critic_1 = polyak_update( + self.critic_1, self.target_critic_1, self.polyak + ) + self.target_critic_2 = polyak_update( + self.critic_2, self.target_critic_2, self.polyak + ) + self.mean_actor_loss /= count + self.mean_alpha_loss /= count + self.mean_critic_1_loss /= count + self.mean_critic_2_loss /= count + + return None + + def update_critic(self, batch): + critic_obs = batch["critic_obs"] + actions = batch["actions"] + rewards = batch["rewards"] + next_actor_obs = batch["next_actor_obs"] + next_critic_obs = batch["next_critic_obs"] + dones = batch["dones"] + + with torch.no_grad(): + # * self._sample_action(actor_next_obs) + mean, std = self.actor.forward(next_actor_obs, deterministic=False) + distribution = torch.distributions.Normal(mean, std) + next_actions = distribution.rsample() + + ## * self._scale_actions(actions, intermediate=True) + actions_normalized = torch.tanh(next_actions) + # RSL also does a resahpe(-1, self.action_size), not sure why + actions_scaled = ( + actions_normalized * self.action_delta + self.action_offset + ).clamp(self.action_min, self.action_max) + ## * + action_logp = ( + distribution.log_prob(next_actions) + - torch.log(1.0 - actions_normalized.pow(2) + 1e-6) + ).sum(-1) + + # * returns target_action = actions_scaled, target_action_logp = action_logp + target_action = actions_scaled + target_action_logp = action_logp + + # * self._critic_input + # ! 
def should put the action computation into the actor + target_critic_in = torch.cat((next_critic_obs, target_action), dim=-1) + target_critic_prediction_1 = self.target_critic_1.forward(target_critic_in) + target_critic_prediction_2 = self.target_critic_2.forward(target_critic_in) + + target_next = ( + torch.min(target_critic_prediction_1, target_critic_prediction_2) + - self.alpha.detach() * target_action_logp + ) + # the detach inside torch.no_grad() should be redundant + target = rewards + self.gamma * dones.logical_not() * target_next + + critic_in = torch.cat((critic_obs, actions), dim=-1) + + # critic_prediction_1 = self.critic_1.forward(critic_in) + critic_loss_1 = self.critic_1.loss_fn(critic_in, target) + self.critic_1_optimizer.zero_grad() + critic_loss_1.backward() + # nn.utils.clip_grad_norm_(self.critic_1.parameters(), self.max_grad_norm) + self.critic_1_optimizer.step() + + # critic_prediction_2 = self.critic_2.forward(critic_in) + critic_loss_2 = self.critic_2.loss_fn(critic_in, target) + self.critic_2_optimizer.zero_grad() + critic_loss_2.backward() + # nn.utils.clip_grad_norm_(self.critic_2.parameters(), self.max_grad_norm) + self.critic_2_optimizer.step() + + self.mean_critic_1_loss += critic_loss_1.item() + self.mean_critic_2_loss += critic_loss_2.item() + + return + + def update_actor_and_alpha(self, batch): + actor_obs = batch["actor_obs"] + critic_obs = batch["critic_obs"] + + mean, std = self.actor.forward(actor_obs, deterministic=False) + distribution = torch.distributions.Normal(mean, std) + actions = distribution.rsample() + + ## * self._scale_actions(actions, intermediate=True) + actions_normalized = torch.tanh(actions) + # RSL also does a resahpe(-1, self.action_size), not sure why + actions_scaled = ( + actions_normalized * self.action_delta + self.action_offset + ).clamp(self.action_min, self.action_max) + ## * + action_logp = ( + distribution.log_prob(actions) + - torch.log(1.0 - actions_normalized.pow(2) + 1e-6) + ).sum(-1) + + # * returns target_action = actions_scaled, target_action_logp = action_logp + actor_prediction = actions_scaled + actor_prediction_logp = action_logp + + # entropy loss + alpha_loss = -( + self.log_alpha * (action_logp + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + critic_in = torch.cat((critic_obs, actor_prediction), dim=-1) + q_value_1 = self.critic_1.forward(critic_in) + q_value_2 = self.critic_2.forward(critic_in) + actor_loss = ( + self.alpha.detach() * actor_prediction_logp + - torch.min(q_value_1, q_value_2) + ).mean() + self.actor_optimizer.zero_grad() + actor_loss.backward() + # nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) + self.actor_optimizer.step() + + self.mean_alpha_loss += alpha_loss.item() + self.mean_actor_loss += actor_loss.item() diff --git a/learning/modules/QRCritics.py b/learning/modules/QRCritics.py new file mode 100644 index 00000000..9dd51489 --- /dev/null +++ b/learning/modules/QRCritics.py @@ -0,0 +1,805 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from learning.modules.utils import RunningMeanStd, create_MLP +from learning.modules.utils import ( + quadratify_xAx, + compose_cholesky, + create_lower_diagonal, + least_squares_fit, + forward_affine, + create_PD_lower_diagonal, +) + + +def init_weights(m): + if isinstance(m, nn.Linear): + torch.nn.init.xavier_uniform_(m.weight) + m.bias.data.fill_(0.01) + + +class Critic2(nn.Module): + def __init__( + self, 
+ num_obs, + hidden_dims=None, + activation="elu", + normalize_obs=True, + device="cuda", + **kwargs, + ): + super().__init__() + + self.NN = create_MLP(num_obs, 1, hidden_dims, activation, **kwargs) + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + self.value_offset = nn.Parameter(torch.zeros(1, device=device)) + + def forward(self, x, return_all=False): + if self._normalize_obs: + with torch.no_grad(): + x = self.obs_rms(x) + value = self.NN(x).squeeze() + self.value_offset + if return_all: + return {"value": value} + else: + return value + + def evaluate(self, critic_observations, return_all=False): + return self.forward(critic_observations, return_all=return_all) + + def loss_fn(self, obs, target, **kwargs): + return nn.functional.mse_loss(self.forward(obs), target, reduction="mean") + + +class OuterProduct(nn.Module): + def __init__( + self, + num_obs, + hidden_dims, + latent_dim=None, + activation="elu", + dropouts=0.0, + normalize_obs=False, + minimize=False, + device="cuda", + **kwargs, + ): + super().__init__() + self.device = device + if latent_dim is None: + latent_dim = num_obs + self.latent_dim = latent_dim + self.minimize = minimize + if minimize: + self.sign = torch.ones(1, device=device) + else: + self.sign = -torch.ones(1, device=device) + self.value_offset = nn.Parameter(torch.zeros(1, device=device)) + + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + + self.NN = create_MLP(num_obs, latent_dim, hidden_dims, activation, dropouts) + # todo: have a linear NN to automatically coord-shift input + + def forward(self, x, return_all=False): + z = self.NN(x) + # outer product. Both of these are equivalent + A = z.unsqueeze(-1) @ z.unsqueeze(-2) + # A2 = torch.einsum("nmx,nmy->nmxy", z, z) + # value = self.sign * quadratify_xAx(x, A) + value = quadratify_xAx(x, A) + value *= 1.0 if self.minimize else -1.0 + value += self.value_offset + + if return_all: + # return value, A, L + return {"value": value, "A": A} + else: + return value + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, obs, target, **kwargs): + loss_NN = F.mse_loss(self.forward(obs), target, reduction="mean") + + return loss_NN + + +class OuterProductLatent(OuterProduct): + def __init__( + self, + num_obs, + hidden_dims, + latent_dim=None, + activation="elu", + dropouts=0.0, + normalize_obs=False, + minimize=False, + latent_hidden_dims=[128, 128], + latent_activation="tanh", + device="cuda", + **kwargs, + ): + super().__init__( + num_obs, + hidden_dims, + latent_dim, + activation, + dropouts, + normalize_obs, + minimize, + device, + **kwargs, + ) + self.latent_NN = create_MLP( + num_obs, latent_dim, latent_hidden_dims, latent_activation + ) + + def forward(self, x, return_all=False): + z = self.NN(x) + # outer product. 
Both of these are equivalent + A = z.unsqueeze(-1) @ z.unsqueeze(-2) + # A2 = torch.einsum("nmx,nmy->nmxy", z, z) + value = self.sign * quadratify_xAx(self.latent_NN(x), A) + value += self.value_offset + + if return_all: + # return value, A, L + return {"value": value, "A": A, "z": z} + else: + return value + + +class CholeskyInput(nn.Module): + def __init__( + self, + num_obs, + hidden_dims, + latent_dim=None, + activation="elu", + dropouts=0.0, + normalize_obs=False, + minimize=False, + device="cuda", + **kwargs, + ): + super().__init__() + self.device = device + if latent_dim is None: + latent_dim = num_obs + self.latent_dim = latent_dim + self.minimize = minimize + if minimize: + self.sign = torch.ones(1, device=device) + else: + self.sign = -torch.ones(1, device=device) + self.value_offset = nn.Parameter(torch.zeros(1, device=device)) + # V = x'Ax + b = a* x'Ax + b, and add a regularization for det(A) ~= 1 + # self.scaling_quadratic = nn.Parameter(torch.ones(1, device=device)) + + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + + num_lower_diag_elements = sum(range(latent_dim + 1)) + self.lower_diag_NN = create_MLP( + num_obs, num_lower_diag_elements, hidden_dims, activation, dropouts + ) + # self.lower_diag_NN.apply(init_weights) + + def forward(self, x, return_all=False): + output = self.lower_diag_NN(x) + L = create_lower_diagonal(output, self.latent_dim, self.device) + A = compose_cholesky(L) + value = self.sign * quadratify_xAx(x, A) + value += self.value_offset + # value *= self.scaling_quadratic + + if return_all: + # return value, A, L + return {"value": value, "A": A, "L": L} + else: + return value + # return self.sign * quadratify_xAx(x, A) + self.value_offset + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, obs, target, **kwargs): + return F.mse_loss(self.forward(obs), target, reduction="mean") + + +class CholeskyLatent(CholeskyInput): + def __init__( + self, + num_obs, + hidden_dims, + latent_dim, + latent_hidden_dims=[64], + latent_activation="tanh", + latent_dropouts=0.0, + activation="elu", + dropouts=0.0, + normalize_obs=False, + minimize=False, + device="cuda", + **kwargs, + ): + super().__init__( + num_obs, + hidden_dims, + latent_dim=latent_dim, + activation=activation, + dropouts=dropouts, + normalize_obs=normalize_obs, + minimize=minimize, + device="cuda", + **kwargs, + ) + + self.latent_NN = create_MLP( + num_obs, latent_dim, latent_hidden_dims, latent_activation + ) + + # self.latent_NN.apply(init_weights) + + def forward(self, x, return_all=False): + z = self.latent_NN(x) + output = self.lower_diag_NN(x) + L = create_lower_diagonal(output, self.latent_dim, self.device) + A = compose_cholesky(L) + + value = self.sign * quadratify_xAx(z, A) + # do not affect value offset in this part of the loss + value += self.value_offset + # value *= self.scaling_quadratic + + if return_all: + # return value, A, L, z + return {"value": value, "A": A, "L": L, "z": z} + else: + return value + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + +class PDCholeskyInput(CholeskyInput): + def forward(self, x, return_all=False): + output = self.lower_diag_NN(x) + L = create_PD_lower_diagonal(output, self.latent_dim, self.device) + A = compose_cholesky(L) + # assert (torch.linalg.eigvals(A).real > 0).all() + # value = self.sign * quadratify_xAx(x, A) + value = quadratify_xAx(x, A) + value *= 1.0 if self.minimize else -1.0 + value += 
self.value_offset + # value *= self.scaling_quadratic + + if return_all: + # return value, A, L + return {"value": value, "A": A, "L": L} + else: + return value + # return self.sign * quadratify_xAx(x, A) + self.value_offset + + +class PDCholeskyLatent(CholeskyLatent): + def forward(self, x, return_all=False): + z = self.latent_NN(x) + output = self.lower_diag_NN(x) + L = create_PD_lower_diagonal(output, self.latent_dim, self.device) + A = compose_cholesky(L) + # assert (torch.linalg.eigvals(A).real > 0).all() + value = self.sign * quadratify_xAx(z, A) + value += self.value_offset + # value *= self.scaling_quadratic + + if return_all: + # return value, A, L + return {"value": value, "A": A, "L": L} + else: + return value + + +class SpectralLatent(nn.Module): + def __init__( + self, + num_obs, + hidden_dims, + activation="elu", + dropouts=0.0, + normalize_obs=False, + relative_dim=None, + latent_dim=None, + latent_hidden_dims=[64], + latent_activation="tanh", + latent_dropouts=0.0, + minimize=False, + device="cuda", + **kwargs, + ): + super().__init__() + self.device = device + if relative_dim is None: + relative_dim = num_obs + if latent_dim is None: + latent_dim = num_obs + + self.num_obs = num_obs + self.relative_dim = relative_dim + self.latent_dim = latent_dim + self.minimize = minimize + if minimize: + self.sign = torch.ones(1, device=device) + else: + self.sign = -torch.ones(1, device=device) + self.value_offset = nn.Parameter(torch.zeros(1, device=device)) + # self.scaling_quadratic = nn.Parameter(torch.ones(1, device=device)) + + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + # TODO up to here, same as Cholesky, can be refactored + n_outputs = ( + torch.tril_indices(self.relative_dim, self.latent_dim).shape[1] + + relative_dim + ) + self.spectral_NN = create_MLP( + num_obs, n_outputs, hidden_dims, activation, dropouts + ) + self.latent_NN = create_MLP( + num_obs, latent_dim, latent_hidden_dims, latent_activation + ) + + def forward(self, x, return_all=False): + z = self.latent_NN(x) + y = self.spectral_NN(x) + A_diag = torch.diag_embed(F.softplus(y[..., : self.relative_dim])) + # tril_indices = torch.tril_indices(self.latent_dim, self.relative_dim) + tril_indices = torch.tril_indices(self.relative_dim, self.latent_dim) + L = torch.zeros( + (*x.shape[:-1], self.relative_dim, self.latent_dim), device=self.device + ) + L[..., tril_indices[0], tril_indices[1]] = y[..., self.relative_dim :] + # Compute (L^T A_diag) L + A = torch.einsum( + "...ij,...jk->...ik", + torch.einsum("...ik,...kj->...ij", L.transpose(-1, -2), A_diag), + L, + ) + # assert (torch.linalg.eigvals(A).real >= -1e-6).all() + # ! This fails, but so far with just -e-10, so really really small... + value = self.sign * quadratify_xAx(z, A) + # with torch.no_grad(): + value += self.value_offset + + if return_all: + # return value, z, torch.diag(A_diag), L, A + return { + "value": value, + "z": z, + # "A_diag": torch.diag(A_diag), #! 
throws shape error + "L": L, + "A": A, + } + else: + return value + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, obs, target, **kwargs): + loss_NN = F.mse_loss(self.forward(obs), target, reduction="mean") + + # if self.minimize: + # loss_offset = (self.value_offset / target.min() - 1.0).pow(2) + # else: + # loss_offset = (self.value_offset / target.max() - 1.0).pow(2) + + # loss_scaling = (self.scaling_quadratic - target.mean()).pow(2) + + return loss_NN # + loss_offset # + loss_scaling + + +class DenseSpectralLatent(nn.Module): + def __init__( + self, + num_obs, + hidden_dims, + activation="elu", + dropouts=0.0, + normalize_obs=False, + relative_dim=None, + latent_dim=None, + latent_hidden_dims=[64], + latent_activation="tanh", + latent_dropouts=0.0, + minimize=False, + device="cuda", + **kwargs, + ): + super().__init__() + self.device = device + if relative_dim is None: + relative_dim = num_obs + if latent_dim is None: + latent_dim = num_obs + + self.num_obs = num_obs + self.relative_dim = relative_dim + self.latent_dim = latent_dim + self.minimize = minimize + if minimize: + self.sign = torch.ones(1, device=device) + else: + self.sign = -torch.ones(1, device=device) + self.value_offset = nn.Parameter(torch.zeros(1, device=device)) + # self.scaling_quadratic = nn.Parameter(torch.ones(1, device=device)) + + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + # TODO up to here, same as Cholesky, can be refactored + n_zeros = sum(range(relative_dim)) + n_outputs = relative_dim + relative_dim * latent_dim - n_zeros + self.rel_fill = n_zeros + # n_outputs = ( + # relative_dim * latent_dim + # - torch.tril_indices(self.relative_dim, self.latent_dim).shape[1] + # + relative_dim + # ) + self.spectral_NN = create_MLP( + num_obs, n_outputs, hidden_dims, activation, dropouts + ) + self.latent_NN = create_MLP( + num_obs, latent_dim, latent_hidden_dims, latent_activation + ) + + def forward(self, x, return_all=False): + z = self.latent_NN(x) + y = self.spectral_NN(x) + A_diag = torch.diag_embed(F.softplus(y[..., : self.relative_dim])) + # tril_indices = torch.tril_indices(self.latent_dim, self.relative_dim) + tril_indices = torch.tril_indices(self.relative_dim, self.latent_dim) + L = torch.zeros( + (*x.shape[:-1], self.relative_dim, self.latent_dim), device=self.device + ) + L[..., tril_indices[0], tril_indices[1]] = y[ + ..., self.relative_dim : self.relative_dim + tril_indices.shape[1] + ] + L[..., :, self.relative_dim :] = y[ + ..., self.relative_dim + tril_indices.shape[1] : + ].view(L[..., :, self.relative_dim :].shape) + + # Compute (L^T A_diag) L + A = torch.einsum( + "...ij,...jk->...ik", + torch.einsum("...ik,...kj->...ij", L.transpose(-1, -2), A_diag), + L, + ) + # A = torch.einsum( + # "...ij,...jk->...ik", + # torch.einsum("...ik,...kj->...ij", L, A_diag), + # L.transpose(-1, -2), + # ) + # assert (torch.linalg.eigvals(A).real >= -1e-6).all() + # ! This fails, but so far with just -e-10, so really really small... + value = self.sign * quadratify_xAx(z, A) + # with torch.no_grad(): + value += self.value_offset + + if return_all: + # return value, z, torch.diag(A_diag), L, A + return { + "value": value, + "z": z, + # "A_diag": torch.diag(A_diag), #! 
throws shape error + "L": L, + "A": A, + } + else: + return value + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, obs, target, **kwargs): + loss_NN = F.mse_loss(self.forward(obs), target, reduction="mean") + + # if self.minimize: + # loss_offset = (self.value_offset / target.min() - 1.0).pow(2) + # else: + # loss_offset = (self.value_offset / target.max() - 1.0).pow(2) + + # loss_scaling = (self.scaling_quadratic - target.mean()).pow(2) + + return loss_NN # + loss_offset # + loss_scaling + + +class QR(nn.Module): + def __init__( + self, + num_obs, + hidden_dims, + action_dim, + latent_dim=None, + activation="elu", + dropouts=0.0, + normalize_obs=False, + minimize=False, + device="cuda", + **kwargs, + ): + super().__init__() + self.device = device + if latent_dim is None: + latent_dim = num_obs + self.latent_dim = latent_dim + self.action_dim = action_dim + self.minimize = minimize + if minimize: + self.sign = torch.ones(1, device=device) + else: + self.sign = -torch.ones(1, device=device) + + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + + # Make Q estimator + num_lower_diag_elements_Q = sum(range(self.latent_dim + 1)) + self.lower_diag_NN_Q = create_MLP( + latent_dim, num_lower_diag_elements_Q, hidden_dims, activation, dropouts + ) + # self.lower_diag_NN_Q.apply(init_weights) + + # Make R estimator + num_lower_diag_elements_R = sum(range(action_dim + 1)) + self.lower_diag_NN_R = create_MLP( + latent_dim, num_lower_diag_elements_R, hidden_dims, activation, dropouts + ) + # self.lower_diag_NN_R.apply(init_weights) + + def forward(self, z, u, return_all=False): + Q_lower_diag = self.lower_diag_NN_Q(z) + L_Q = create_lower_diagonal(Q_lower_diag, self.latent_dim, self.device) + Q = compose_cholesky(L_Q) + + R_lower_diag = self.lower_diag_NN_R(z) + L_R = create_lower_diagonal(R_lower_diag, self.action_dim, self.device) + R = compose_cholesky(L_R) + + if return_all: + # return Q, L_Q, R, L_R + return {"Q": Q, "L_Q": L_Q, "R": R, "L_R": L_R} + else: + return Q, R + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, z, target, actions, **kwargs): + Q, R = self.forward(z, target) + Q_value = self.sign * quadratify_xAx(z, Q) + R_value = self.sign * quadratify_xAx(actions, R) + gae_loss = F.mse_loss(Q_value + R_value, target, reduction="mean") + return gae_loss + + def riccati_loss_fn(self, z, target, P, **kwargs): + Q, R = self.forward(z, target) + A, B = self.linearize_pendulum_dynamics() + A = A.to(self.device) + B = B.to(self.device) + AT_P_A = torch.einsum( + "...ij, ...ik -> ...ik", + torch.einsum("...ij, ...jk -> ...ik", A.transpose(-1, -2), P), + A, + ) + AT_P_B = torch.einsum( + "...ij, ...ik -> ...ik", + torch.einsum("...ij, ...jk -> ...ik", A.transpose(-1, -2), P), + B, + ) + R_BT_B = R + torch.einsum("...ij, ...jk -> ...ik", B.transpose(-1, -2), B) + BT_P_A = torch.einsum( + "...ij, ...ik -> ...ik", + torch.einsum("...ij, ...jk -> ...ik", B.transpose(-1, -2), P), + A, + ) + final_term = torch.einsum( + "...ij, ...jk -> ...ik", + torch.einsum("...ij, ...jk -> ...ik", AT_P_B, torch.linalg.inv(R_BT_B)), + BT_P_A, + ) + riccati_loss = Q + AT_P_A - P - final_term + return F.mse_loss(riccati_loss, target=0.0, reduction="mean") + + def linearize_pendulum_dynamics(self, x_desired=torch.tensor([0.0, 0.0])): + m = 1.0 + b = 0.1 + length = 1.0 + g = 9.81 + ml2 = m * length**2 + + A = torch.tensor( + [[0.0, 1.0], [g / length * 
torch.cos(x_desired[0]), -b / ml2]], + device=self.device, + ) + B = torch.tensor([[0.0], [(1.0 / ml2)]], device=self.device) + return A, B + + +class QPNet(nn.Module): + def __init__( + self, + num_obs, + hidden_dims, + latent_dim=None, + activation="elu", + dropouts=0.0, + normalize_obs=False, + minimize=False, + device="cuda", + **kwargs, + ): + super().__init__() + self.device = device + if latent_dim is None: + latent_dim = num_obs + self.latent_dim = latent_dim + self.minimize = minimize + if minimize: + self.sign = torch.ones(1, device=device) + else: + self.sign = -torch.ones(1, device=device) + self.value_offset = nn.Parameter(torch.zeros(1, device=device)) + # V = x'Ax + b = a* x'Ax + b, and add a regularization for det(A) ~= 1 + self.scaling_quadratic = nn.Parameter(torch.ones(1, device=device)) + + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + + self.num_lower_diag_elements = sum(range(latent_dim + 1)) + self.lower_diag_NN = create_MLP( + num_obs, + self.num_lower_diag_elements + num_obs, + hidden_dims, + activation, + dropouts, + ) # ! num_obs added to output to create c for c.T@x + # self.lower_diag_NN.apply(init_weights) + + def forward(self, x, return_all=False): + res = self.lower_diag_NN(x) + output = res[..., : self.num_lower_diag_elements] + c = res[..., self.num_lower_diag_elements :] + L = create_lower_diagonal(output, self.latent_dim, self.device) + A = compose_cholesky(nn.ReLU()(L)) + value = self.sign * quadratify_xAx(x, A) + torch.einsum( + "...ij, ...ij -> ...i", c, x + ) + value += self.value_offset + # value *= self.scaling_quadratic + + if return_all: + # return value, A, L + return {"value": value, "A": A, "L": L, "c": c} + else: + return value + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, obs, target, **kwargs): + loss_NN = F.mse_loss(self.forward(obs), target, reduction="mean") + + # loss_scaling = (self.scaling_quadratic - target.mean()).pow(2) + + return loss_NN # + loss_offset + loss_scaling + + +class NN_wRiccati(nn.Module): + def __init__(self, **kwargs): + super().__init__() + # assert ( + # "critic_name" in kwargs.keys() + # ), "Name of custom critic not specified in NN with Riccati init" + # assert "action_dim" in kwargs.keys(), "Dimension of NN actions not specified." 
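For context, riccati_loss_fn above is meant to pull the learned quadratic cost toward a fixed point of the discrete-time algebraic Riccati equation for the linearized pendulum. A standalone sketch of the standard residual follows; it is not part of the patch, and it assumes square matrices with R + B'PB invertible. Two points worth double-checking against the code above: the textbook form uses R + B'PB where the loss builds R + B'B, and F.mse_loss expects a tensor target, so residual.pow(2).mean() or a torch.zeros_like target is probably needed.

import torch


def dare_residual(P, A, B, Q, R):
    # Residual of P = Q + A'PA - A'PB (R + B'PB)^(-1) B'PA; zero at the DARE fixed point.
    AtPA = A.T @ P @ A
    AtPB = A.T @ P @ B
    BtPA = B.T @ P @ A
    BtPB = B.T @ P @ B
    return Q + AtPA - AtPB @ torch.linalg.inv(R + BtPB) @ BtPA - P


# Usage with the linearized pendulum, e.g.:
#   A, B = qr_network.linearize_pendulum_dynamics()
#   loss = dare_residual(P, A, B, Q, R).pow(2).mean()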
+ critic_name = kwargs["critic_name"] + device = kwargs["device"] + self.has_latent = False if kwargs["latent_dim"] is None else True + self.critic = eval(f"{critic_name}(**kwargs).to(device)") + self.QR_network = QR(**kwargs).to(device) + + def forward(self, x, return_all=False): + return self.critic.forward(x, return_all) + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, obs, target, actions, **kwargs): + return self.value_loss(obs, target, actions) + self.reg_loss( + obs, target, actions + ) + + def value_loss(self, obs, target, actions, **kwargs): + return self.critic.loss_fn(obs, target, actions=actions) + + def reg_loss(self, obs, target, actions, **kwargs): + with torch.no_grad(): + P = self.critic(obs, return_all=True)["A"] + z = obs if self.has_latent is False else self.critic.latent_NN(obs) + return self.QR_network.riccati_loss_fn(z, target, P=P) + + +class NN_wQR(nn.Module): + def __init__(self, **kwargs): + super().__init__() + # assert ( + # "critic_name" in kwargs.keys() + # ), "Name of custom critic not specified in NN with Riccati init" + # assert "action_dim" in kwargs.keys(), "Dimension of NN actions not specified." + critic_name = kwargs["critic_name"] + device = kwargs["device"] + self.has_latent = False if kwargs["latent_dim"] is None else True + self.critic = eval(f"{critic_name}(**kwargs).to(device)") + self.QR_network = QR(**kwargs).to(device) + + def forward(self, x, return_all=False): + return self.critic.forward(x, return_all) + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, obs, target, actions, **kwargs): + return self.value_loss(obs, target, actions) + self.reg_loss( + obs, target, actions + ) + + def value_loss(self, obs, target, actions, **kwargs): + return self.critic.loss_fn(obs, target, actions=actions) + + def reg_loss(self, obs, target, actions, **kwargs): + z = obs if self.has_latent is False else self.critic.latent_NN(obs) + return self.QR_network.loss_fn(z, target, actions=actions) + + +class NN_wLinearLatent(nn.Module): + def __init__(self, **kwargs): + super().__init__() + critic_name = kwargs["critic_name"] + device = kwargs["device"] # noqa + self.critic = eval(f"{critic_name}(**kwargs).to(device)") + + @property + def value_offset(self): + return self.critic.value_offset + + def forward(self, x, return_all=False): + return self.critic.forward(x, return_all) + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, obs, target, actions, **kwargs): + return self.value_loss(obs, target, actions) + self.reg_loss( + obs, target, actions + ) + + def value_loss(self, obs, target, actions, **kwargs): + return self.critic.loss_fn(obs, target, actions=actions) + + def reg_loss(self, obs, target, actions, **kwargs): + z = self.critic.latent_NN(obs) + u = actions.detach().clone() + u = u.view(-1, u.shape[-1]) + K, z_offset = least_squares_fit(z, u) + return F.mse_loss(forward_affine(z, K, z_offset), target=u, reduction="mean") diff --git a/learning/modules/__init__.py b/learning/modules/__init__.py index 75cca5e2..9505377d 100644 --- a/learning/modules/__init__.py +++ b/learning/modules/__init__.py @@ -33,3 +33,7 @@ from .actor_critic import ActorCritic from .state_estimator import StateEstimatorNN from .actor import Actor +from .critic import Critic +from .chimera_actor import ChimeraActor +from .smooth_actor import SmoothActor +from .QRCritics import * diff --git a/learning/modules/actor.py 
b/learning/modules/actor.py index f9c82396..0f112dba 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -17,11 +17,6 @@ def __init__( normalize_obs=True, **kwargs, ): - if kwargs: - print( - "Actor.__init__ got unexpected arguments, " - "which will be ignored: " + str([key for key in kwargs.keys()]) - ) super().__init__() self._normalize_obs = normalize_obs @@ -30,7 +25,9 @@ def __init__( self.num_obs = num_obs self.num_actions = num_actions - self.NN = create_MLP(num_obs, num_actions, hidden_dims, activation) + self.hidden_dims = hidden_dims + self.activation = activation + self.NN = create_MLP(num_obs, num_actions, hidden_dims, activation, **kwargs) # Action noise self.std = nn.Parameter(init_noise_std * torch.ones(num_actions)) @@ -46,14 +43,16 @@ def action_mean(self): def action_std(self): return self.distribution.stddev + @property + def get_std(self): + return self.distribution.stddev + @property def entropy(self): return self.distribution.entropy().sum(dim=-1) def update_distribution(self, observations): - if self._normalize_obs: - observations = self.normalize(observations) - mean = self.NN(observations) + mean = self.act_inference(observations) self.distribution = Normal(mean, mean * 0.0 + self.std) def act(self, observations): @@ -65,16 +64,16 @@ def get_actions_log_prob(self, actions): def act_inference(self, observations): if self._normalize_obs: - observations = self.normalize(observations) - actions_mean = self.NN(observations) - return actions_mean + with torch.no_grad(): + observations = self.obs_rms(observations) + return self.NN(observations) def forward(self, observations): return self.act_inference(observations) - def normalize(self, observation): - with torch.no_grad(): - return self.obs_rms(observation) - def export(self, path): export_network(self, "policy", path, self.num_obs) + + def log_actions(self, mean, sample, path): + with open(path, "a") as f: + f.write(str(mean.item()) + ", " + str(sample.item()) + "\n") diff --git a/learning/modules/actor_critic.py b/learning/modules/actor_critic.py index ed837f5e..cec8cbb5 100644 --- a/learning/modules/actor_critic.py +++ b/learning/modules/actor_critic.py @@ -1,44 +1,17 @@ import torch.nn as nn -from .actor import Actor -from .critic import Critic - class ActorCritic(nn.Module): - def __init__( - self, - num_actor_obs, - num_critic_obs, - num_actions, - actor_hidden_dims=[256, 256, 256], - critic_hidden_dims=[256, 256, 256], - activation="elu", - init_noise_std=1.0, - normalize_obs=True, - **kwargs, - ): + def __init__(self, actor, critic, **kwargs): if kwargs: print( "ActorCritic.__init__ got unexpected arguments, " "which will be ignored: " + str([key for key in kwargs.keys()]) ) - super(ActorCritic, self).__init__() - - self.actor = Actor( - num_actor_obs, - num_actions, - actor_hidden_dims, - activation, - init_noise_std, - normalize_obs, - ) - - self.critic = Critic( - num_critic_obs, critic_hidden_dims, activation, normalize_obs - ) + super().__init__() - print(f"Actor MLP: {self.actor.NN}") - print(f"Critic MLP: {self.critic.NN}") + self.actor = actor + self.critic = critic @property def action_mean(self): diff --git a/learning/modules/chimera_actor.py b/learning/modules/chimera_actor.py new file mode 100644 index 00000000..1d5ead41 --- /dev/null +++ b/learning/modules/chimera_actor.py @@ -0,0 +1,81 @@ +import torch +import torch.nn as nn +from torch.distributions import Normal +from .utils import create_MLP +from .utils import export_network +from .utils import RunningMeanStd + + +class 
ChimeraActor(nn.Module): + def __init__( + self, + num_obs, + num_actions, + # hidden_dims, + # activation, + nn_params, + std_init=1.0, + log_std_max=4.0, + log_std_min=-20.0, + normalize_obs=True, + **kwargs, + ): + super().__init__() + + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + + self.log_std_max = log_std_max + self.log_std_min = log_std_min + self.log_std_init = torch.tensor([std_init]).log() # refactor + + self.num_obs = num_obs + self.num_actions = num_actions + + self.latent_NN = create_MLP( + num_inputs=num_obs, + num_outputs=nn_params["latent"]["hidden_dims"][-1], + **nn_params["latent"], + ) + self.mean_NN = create_MLP( + num_inputs=nn_params["latent"]["hidden_dims"][-1], + num_outputs=num_actions, + **nn_params["mean"], + ) + self.std_NN = create_MLP( + num_inputs=nn_params["latent"]["hidden_dims"][-1], + num_outputs=num_actions, + **nn_params["std"], + ) + + # maybe zap + self.distribution = Normal(torch.zeros(num_actions), torch.ones(num_actions)) + Normal.set_default_validate_args = False + + def forward(self, x, deterministic=True): + if self._normalize_obs: + with torch.no_grad(): + x = self.obs_rms(x) + latent = self.latent_NN(x) + mean = self.mean_NN(latent) + if deterministic: + return mean + log_std = self.log_std_init + self.std_NN(latent) + return mean, log_std.clamp(self.log_std_min, self.log_std_max).exp() + + def act(self, x): + mean, std = self.forward(x, deterministic=False) + self.distribution = Normal(mean, std) + return self.distribution.sample() + + def inference_policy(self, x): + return self.forward(x, deterministic=True) + + def export(self, path): + export_network(self.inference_policy, "policy", path, self.num_obs) + + def to(self, device): + super().to(device) + self.log_std_init = self.log_std_init.to(device) + return self diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 47e0decf..5dfad32d 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -1,5 +1,6 @@ import torch import torch.nn as nn + from .utils import create_MLP from .utils import RunningMeanStd @@ -8,28 +9,26 @@ class Critic(nn.Module): def __init__( self, num_obs, - hidden_dims, + hidden_dims=None, activation="elu", normalize_obs=True, **kwargs, ): - if kwargs: - print( - "Critic.__init__ got unexpected arguments, " - "which will be ignored: " + str([key for key in kwargs.keys()]) - ) super().__init__() - self.NN = create_MLP(num_obs, 1, hidden_dims, activation) + self.NN = create_MLP(num_obs, 1, hidden_dims, activation, **kwargs) self._normalize_obs = normalize_obs if self._normalize_obs: self.obs_rms = RunningMeanStd(num_obs) - def evaluate(self, critic_observations): + def forward(self, x): if self._normalize_obs: - critic_observations = self.normalize(critic_observations) - return self.NN(critic_observations).squeeze() + with torch.no_grad(): + x = self.obs_rms(x) + return self.NN(x).squeeze() + + def evaluate(self, critic_observations): + return self.forward(critic_observations) - def normalize(self, observation): - with torch.no_grad(): - return self.obs_rms(observation) + def loss_fn(self, obs, target, **kwargs): + return nn.functional.mse_loss(self.forward(obs), target, reduction="mean") diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py new file mode 100644 index 00000000..c02e9203 --- /dev/null +++ b/learning/modules/smooth_actor.py @@ -0,0 +1,125 @@ +import torch +import torch.nn as nn +from torch.distributions import Normal + +from .actor 
import Actor +from .utils import create_MLP + + +# The following implementation is based on the gSDE paper. See code: +# https://github.com/DLR-RM/stable-baselines3/blob/56f20e40a2206bbb16501a0f600e29ce1b112ef1/stable_baselines3/common/distributions.py#L421C7-L421C38 +class SmoothActor(Actor): + weights_dist: Normal + _latent_sde: torch.Tensor + exploration_matrices: torch.Tensor + + def __init__( + self, + *args, + full_std: bool = True, + use_exp_ln: bool = True, + learn_features: bool = True, + epsilon: float = 1e-6, + log_std_init: float = 0.0, + exploration_sample_freq: int = 16, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.full_std = full_std + self.use_exp_ln = use_exp_ln + self.learn_features = learn_features + self.epsilon = epsilon + self.log_std_init = log_std_init + + # Create latent NN and last layer + self.latent_net = create_MLP( + self.num_obs, self.hidden_dims[-1], self.hidden_dims[:-1], self.activation + ) + self.latent_dim = self.hidden_dims[-1] + self.mean_actions_net = nn.Linear(self.latent_dim, self.num_actions) + # Reduce the number of parameters if needed + if self.full_std: + log_std = torch.ones(self.latent_dim, self.num_actions) + else: + log_std = torch.ones(self.latent_dim, 1) + self.log_std = nn.Parameter(log_std * self.log_std_init, requires_grad=True) + # Sample an exploration matrix + self.update_exploration_matrices(batch_size=1) + self.distribution = None + + self.call_counter = 0 + self.resampling_frequenchy = exploration_sample_freq + + def update_exploration_matrices(self, batch_size=1): + # Sample weights for the noise exploration matrix + std = self.get_std + self.weights_dist = Normal(torch.zeros_like(std), std) + # Pre-compute matrices in case of parallel exploration + self.exploration_matrices = self.weights_dist.rsample((batch_size,)) + + @property + def get_std(self): + # TODO[lm]: Check if this is ok, and can use action_std in ActorCritic normally + if self.use_exp_ln: + # From gSDE paper, it allows to keep variance + # above zero and prevent it from growing too fast + below_threshold = torch.exp(self.log_std) * (self.log_std <= 0) + # Avoid NaN: zeros values that are below zero + safe_log_std = self.log_std * (self.log_std > 0) + self.epsilon + above_threshold = (torch.log1p(safe_log_std) + 1.0) * (self.log_std > 0) + std = below_threshold + above_threshold + else: + # Use normal exponential + std = torch.exp(self.log_std) + + if self.full_std: + return std + assert self.latent_dim is not None + # Reduce the number of parameters: + return torch.ones(self.latent_dim, 1).to(self.log_std.device) * std + + def update_distribution(self, observations): + if self._normalize_obs: + with torch.no_grad(): + observations = self.obs_rms(observations) + # Get latent features and compute distribution + self._latent_sde = self.latent_net(observations) + if not self.learn_features: + self._latent_sde = self._latent_sde.detach() + if self._latent_sde.dim() == 2: + variance = torch.mm(self._latent_sde**2, self.get_std**2) + elif self._latent_sde.dim() == 3: + variance = torch.einsum("abc,cd->abd", self._latent_sde**2, self.get_std**2) + else: + raise ValueError("Invalid latent_sde dimension") + mean_actions = self.mean_actions_net(self._latent_sde) + self.distribution = Normal(mean_actions, torch.sqrt(variance + self.epsilon)) + + def act(self, observations): + if self.call_counter % self.resampling_frequenchy == 0: + self.update_exploration_matrices(observations.shape[0]) + self.call_counter += 1 + + self.update_distribution(observations) + mean 
= self.distribution.mean + sample = mean + self.get_noise() + return sample + + def act_inference(self, observations): + if self._normalize_obs: + with torch.no_grad(): + observations = self.obs_rms(observations) + latent_sde = self.latent_net(observations) + mean_actions = self.mean_actions_net(latent_sde) + return mean_actions + + def get_noise(self): + latent_sde = self._latent_sde + if not self.learn_features: + latent_sde = latent_sde.detach() + # Use batch matrix multiplication for efficient computation + # (batch_size, n_features) -> (batch_size, 1, n_features) + latent_sde = latent_sde.unsqueeze(dim=1) + # (batch_size, 1, n_actions) + noise = torch.bmm(latent_sde, self.exploration_matrices.to(latent_sde.device)) + return noise.squeeze(dim=1) diff --git a/learning/modules/state_estimator.py b/learning/modules/state_estimator.py index f8cfe221..dfba137d 100644 --- a/learning/modules/state_estimator.py +++ b/learning/modules/state_estimator.py @@ -20,7 +20,7 @@ def __init__( num_outputs, hidden_dims=[256, 128], activation="elu", - dropouts=None, + dropouts=0.0, **kwargs, ): if kwargs: diff --git a/learning/modules/utils/__init__.py b/learning/modules/utils/__init__.py index 1422b82d..35868ca0 100644 --- a/learning/modules/utils/__init__.py +++ b/learning/modules/utils/__init__.py @@ -1,2 +1,3 @@ from .neural_net import create_MLP, export_network from .normalize import RunningMeanStd +from .matfncs import create_lower_diagonal, create_PD_lower_diagonal, compose_cholesky, quadratify_xAx, least_squares_fit, forward_affine \ No newline at end of file diff --git a/learning/modules/utils/matfncs.py b/learning/modules/utils/matfncs.py new file mode 100644 index 00000000..92ebd0e7 --- /dev/null +++ b/learning/modules/utils/matfncs.py @@ -0,0 +1,53 @@ +import torch + + +def create_lower_diagonal(x, n, device): + L = torch.zeros((*x.shape[:-1], n, n), device=device, requires_grad=False) + tril_indices = torch.tril_indices(row=n, col=n, offset=0) + rows, cols = tril_indices + L[..., rows, cols] = x + return L + + +def create_PD_lower_diagonal(x, n, device): + tril_indices = torch.tril_indices(row=n, col=n, offset=0) + diag_indices = (tril_indices[0] == tril_indices[1]).nonzero(as_tuple=True)[0] + x[..., diag_indices] = F.softplus(x[..., diag_indices]) + L = torch.zeros((*x.shape[:-1], n, n), device=device, requires_grad=False) + rows, cols = tril_indices + L[..., rows, cols] = x + return L + + +def compose_cholesky(L): + return torch.einsum("...ij, ...jk -> ...ik", L, L.transpose(-2, -1)) + + +def quadratify_xAx(x, A): + return torch.einsum( + "...ij, ...jk -> ...ik", + torch.einsum("...ij, ...jk -> ...ik", x.unsqueeze(-1).transpose(-2, -1), A), + x.unsqueeze(-1), + ).squeeze() + + +def least_squares_fit(x, y): + x_flat = x.view(-1, x.shape[-1]) + y_flat = y.view(-1, y.shape[-1]) + ones = torch.ones(x_flat.shape[0], 1, device=x.device) + x_aug = torch.cat([ones, x_flat], dim=1) + # Use torch.linalg.lstsq to find the least-squares solution + result = torch.linalg.lstsq(x_aug, y_flat) + coefficients = result.solution + bias = coefficients[0] + weights = coefficients[1:] + + return weights, bias + + +def forward_affine(x, weights, bias): + x_flat = x.view(-1, x.shape[-1]) + x_aug = torch.cat( + [torch.ones(x_flat.shape[0], 1, device=x_flat.device), x_flat], dim=1 + ) + return x_aug.matmul(torch.cat([bias.unsqueeze(0), weights], dim=0)) diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index 5fd6ab3b..27b47a54 100644 --- a/learning/modules/utils/neural_net.py 
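The matfncs helpers above carry the core math of the quadratic critics: compose_cholesky builds A = L L' (positive semi-definite by construction) and quadratify_xAx evaluates x'Ax batch-wise. One likely fix: create_PD_lower_diagonal calls F.softplus, but matfncs.py only imports torch, so it presumably also needs import torch.nn.functional as F. A small self-contained sanity check (batch size and dimensions are local to this sketch):

import torch

from learning.modules.utils import (
    compose_cholesky,
    create_lower_diagonal,
    quadratify_xAx,
)

batch, n = 8, 3
raw = torch.randn(batch, sum(range(n + 1)))       # n(n+1)/2 lower-triangular entries
L = create_lower_diagonal(raw, n, device="cpu")
A = compose_cholesky(L)                           # A = L L^T, PSD by construction
x = torch.randn(batch, n)

quad = quadratify_xAx(x, A)                       # batched x^T A x
ref = torch.einsum("bi,bij,bj->b", x, A, x)       # reference computation
assert torch.allclose(quad, ref, atol=1e-5)
assert (torch.linalg.eigvalsh(A) >= -1e-5).all()  # eigenvalues non-negative up to tolerance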
+++ b/learning/modules/utils/neural_net.py @@ -3,29 +3,64 @@ import copy -def create_MLP(num_inputs, num_outputs, hidden_dims, activation, dropouts=None): - activation = get_activation(activation) - - if dropouts is None: - dropouts = [0] * len(hidden_dims) +def create_MLP( + num_inputs, + num_outputs, + hidden_dims, + activations=None, + dropouts=0.0, + layer_norm=False, + just_list=False, + **kwargs, +): + if not isinstance(activations, list): + activations = [activations] * len(hidden_dims) + if not isinstance(dropouts, list): + dropouts = [dropouts] * len(hidden_dims) + if not isinstance(layer_norm, list): + layer_norm = [layer_norm] * len(hidden_dims) layers = [] - if not hidden_dims: # handle no hidden layers + # first layer + if len(hidden_dims) > 0: + add_layer( + layers, + num_inputs, + hidden_dims[0], + activations[0], + dropouts[0], + layer_norm[0], + ) + for i in range(len(hidden_dims) - 1): + add_layer( + layers, + hidden_dims[i], + hidden_dims[i + 1], + activations[i + 1], + dropouts[i + 1], + layer_norm=layer_norm[i + 1], + ) + else: + add_layer(layers, hidden_dims[-1], num_outputs) + else: # handle no hidden dims, just linear layer add_layer(layers, num_inputs, num_outputs) + + if just_list: + return layers else: - add_layer(layers, num_inputs, hidden_dims[0], activation, dropouts[0]) - for i in range(len(hidden_dims)): - if i == len(hidden_dims) - 1: - add_layer(layers, hidden_dims[i], num_outputs) - else: - add_layer( - layers, - hidden_dims[i], - hidden_dims[i + 1], - activation, - dropouts[i + 1], - ) - return torch.nn.Sequential(*layers) + return torch.nn.Sequential(*layers) + + +def add_layer( + layer_list, num_inputs, num_outputs, activation=None, dropout=0, layer_norm=False +): + layer_list.append(torch.nn.Linear(num_inputs, num_outputs)) + if layer_norm: + layer_list.append(torch.nn.LayerNorm(num_outputs)) + if dropout > 0: + layer_list.append(torch.nn.Dropout(p=dropout)) + if activation is not None: + layer_list.append(get_activation(activation)) def get_activation(act_name): @@ -36,26 +71,28 @@ def get_activation(act_name): elif act_name == "relu": return torch.nn.ReLU() elif act_name == "crelu": - return torch.nn.ReLU() + return torch.nn.CELU() elif act_name == "lrelu": return torch.nn.LeakyReLU() elif act_name == "tanh": return torch.nn.Tanh() elif act_name == "sigmoid": return torch.nn.Sigmoid() + elif act_name == "softplus": + return torch.nn.Softplus() + elif act_name == "softmax": + return torch.nn.Softmax(dim=-1) + elif act_name == "randrelu": + return torch.nn.RReLU() + elif act_name == "softsign": + return torch.nn.Softsign() + elif act_name == "mish": + return torch.nn.Mish() else: print("invalid activation function!") return None -def add_layer(layer_list, num_inputs, num_outputs, activation=None, dropout=0): - layer_list.append(torch.nn.Linear(num_inputs, num_outputs)) - if dropout > 0: - layer_list.append(torch.nn.Dropout(p=dropout)) - if activation is not None: - layer_list.append(activation) - - def export_network(network, network_name, path, num_inputs): """ Thsi function traces and exports the given network module in .pt and @@ -72,9 +109,11 @@ def export_network(network, network_name, path, num_inputs): path_TS = os.path.join(path, network_name + ".pt") # TorchScript path path_onnx = os.path.join(path, network_name + ".onnx") # ONNX path model = copy.deepcopy(network).to("cpu") + model.device = "cpu" # force all tensors created as intermediate steps to CPU # To trace model, must be evaluated once with arbitrary input model.eval() - dummy_input = 
torch.rand((1, num_inputs)) + dummy_input = torch.rand((num_inputs)) + # dummy_input = torch.rand((2, num_inputs)) model_traced = torch.jit.trace(model, dummy_input) torch.jit.save(model_traced, path_TS) torch.onnx.export(model_traced, dummy_input, path_onnx) diff --git a/learning/modules/utils/normalize.py b/learning/modules/utils/normalize.py index 5623b80c..246bafa4 100644 --- a/learning/modules/utils/normalize.py +++ b/learning/modules/utils/normalize.py @@ -2,31 +2,10 @@ import torch.nn as nn -def get_mean_var_with_masks(values, masks): - """ - Applies a mask to the input values and calculates - the mean and variance over the valid entries, as specified - by the mask. - """ - sum_mask = masks.sum() - masked_vals = values * masks - values_mean = masked_vals.sum() / sum_mask - min_sqr = (((masked_vals) ** 2) / sum_mask).sum() - ( - (masked_vals / sum_mask).sum() - ) ** 2 - values_var = min_sqr * sum_mask / (sum_mask - 1) - return values_mean, values_var - - class RunningMeanStd(nn.Module): - """ - Keeps a running mean to normalize tensor of choice. - """ - - def __init__(self, num_items, axis=0, epsilon=1e-05): - super(RunningMeanStd, self).__init__() + def __init__(self, num_items, epsilon=1e-05): + super().__init__() self.num_items = num_items - self.axis = axis self.epsilon = epsilon self.register_buffer( @@ -44,11 +23,6 @@ def _update_mean_var_from_moments( batch_var, batch_count, ): - """ - Implements parallel algorithm for combining arbitrary sets A and B - https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - #Parallel_algorithm - """ tot_count = running_count + batch_count delta = batch_mean - running_mean @@ -64,16 +38,10 @@ def _update_mean_var_from_moments( new_var = M2 / (tot_count - 1) return new_mean, new_var, tot_count - def forward(self, input, mask=None): - """ - Returns the normalized version of the input. 
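Since this hunk drops the docstrings (including the pointer to the parallel-variance algorithm), a reference sketch of the combination rule that _update_mean_var_from_moments keeps may be worth retaining nearby. The sketch assumes Bessel-corrected (sample) variances, consistent with new_var = M2 / (tot_count - 1) above; the exact weighting of the two second moments sits in unchanged context not shown here.

def combine_mean_var(mean_a, var_a, n_a, mean_b, var_b, n_b):
    # Chan et al. parallel update: merge running stats of set A with batch stats of B.
    n = n_a + n_b
    delta = mean_b - mean_a
    mean = mean_a + delta * n_b / n
    m2 = var_a * (n_a - 1) + var_b * (n_b - 1) + delta**2 * n_a * n_b / n
    return mean, m2 / (n - 1), n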
- """ + def forward(self, input): if self.training: - if mask is not None: - mean, var = get_mean_var_with_masks(input, mask) - else: - mean = input.mean(self.axis) - var = input.var(self.axis) + mean = input.mean(tuple(range(input.dim() - 1))) + var = input.var(tuple(range(input.dim() - 1))) ( self.running_mean, self.running_var, diff --git a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py index c0462853..1545293b 100644 --- a/learning/runners/BaseRunner.py +++ b/learning/runners/BaseRunner.py @@ -1,6 +1,6 @@ import torch -from learning.algorithms import PPO -from learning.modules import ActorCritic +from learning.algorithms import * # noqa: F403 +from learning.modules import Actor, Critic, SmoothActor from learning.utils import remove_zero_weighted_rewards @@ -8,33 +8,46 @@ class BaseRunner: def __init__(self, env, train_cfg, device="cpu"): self.device = device self.env = env + self.setup_reward_functions() self.parse_train_cfg(train_cfg) - - num_actor_obs = self.get_obs_size(self.policy_cfg["actor_obs"]) - num_critic_obs = self.get_obs_size(self.policy_cfg["critic_obs"]) - num_actions = self.get_action_size(self.policy_cfg["actions"]) - actor_critic = ActorCritic( - num_actor_obs, num_critic_obs, num_actions, **self.policy_cfg - ).to(self.device) - - alg_class = eval(self.cfg["algorithm_class_name"]) - self.alg: PPO = alg_class(actor_critic, device=self.device, **self.alg_cfg) - self.num_steps_per_env = self.cfg["num_steps_per_env"] self.save_interval = self.cfg["save_interval"] self.num_learning_iterations = self.cfg["max_iterations"] self.tot_timesteps = 0 self.it = 0 - - # * init storage and model - self.init_storage() self.log_dir = train_cfg["log_dir"] + self._set_up_alg() + + def setup_reward_functions(self): + self.reward_functions = { + method.replace("_reward_", ""): getattr(self.env, method) + for method in dir(self.env) + if callable(getattr(self.env, method)) and method.startswith("_reward_") + } + + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + + if self.actor_cfg["smooth_exploration"]: + actor = SmoothActor(num_actor_obs, num_actions, **self.actor_cfg) + else: + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + + critic = Critic(num_critic_obs, **self.critic_cfg) + alg_class = eval(self.cfg["algorithm_class_name"]) + self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) def parse_train_cfg(self, train_cfg): self.cfg = train_cfg["runner"] self.alg_cfg = train_cfg["algorithm"] - remove_zero_weighted_rewards(train_cfg["policy"]["reward"]["weights"]) - self.policy_cfg = train_cfg["policy"] + remove_zero_weighted_rewards(train_cfg["critic"]["reward"]["weights"]) + remove_zero_weighted_rewards( + train_cfg["critic"]["reward"]["termination_weight"] + ) + self.actor_cfg = train_cfg["actor"] + self.critic_cfg = train_cfg["critic"] def init_storage(self): raise NotImplementedError @@ -89,8 +102,8 @@ def get_rewards(self, reward_weights, modifier=1, mask=None): if mask is None: mask = 1.0 for name, weight in reward_weights.items(): - rewards_dict[name] = mask * self._get_reward({name: weight}, modifier) + rewards_dict[name] = mask * self._get_reward(name, weight * modifier) return rewards_dict - def _get_reward(self, name_weight, modifier=1): - return modifier * self.env.compute_reward(name_weight).to(self.device) + def _get_reward(self, name, weight): + return weight * 
self.reward_functions[name]().to(self.device) diff --git a/learning/runners/__init__.py b/learning/runners/__init__.py index a24a0c9c..657b2f25 100644 --- a/learning/runners/__init__.py +++ b/learning/runners/__init__.py @@ -31,4 +31,9 @@ # Copyright (c) 2021 ETH Zurich, Nikita Rudin from .on_policy_runner import OnPolicyRunner -from .my_runner import MyRunner \ No newline at end of file +from .my_runner import MyRunner +from .old_policy_runner import OldPolicyRunner +from .off_policy_runner import OffPolicyRunner +from .custom_critic_runner import CustomCriticRunner +from .datalogging_runner import DataLoggingRunner +from .psd_sac_runner import PSACRunner \ No newline at end of file diff --git a/learning/runners/custom_critic_runner.py b/learning/runners/custom_critic_runner.py new file mode 100644 index 00000000..fbfb8c05 --- /dev/null +++ b/learning/runners/custom_critic_runner.py @@ -0,0 +1,178 @@ +import os +import numpy as np +import torch +from tensordict import TensorDict + +from gym import LEGGED_GYM_ROOT_DIR +from learning.algorithms import * # noqa: F403 +from learning.modules import Actor +from learning.utils import Logger + +from .on_policy_runner import OnPolicyRunner +from learning.storage import DictStorage + +logger = Logger() +storage = DictStorage() + + +class CustomCriticRunner(OnPolicyRunner): + def __init__(self, env, train_cfg, device="cpu"): + super().__init__(env, train_cfg, device) + logger.initialize( + self.env.num_envs, + self.env.dt, + self.cfg["max_iterations"], + self.device, + ) + + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) # noqa: F841 + critic_class_name = self.critic_cfg["critic_class_name"] + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + critic = eval(f"{critic_class_name}(num_critic_obs, **self.critic_cfg)") + alg_class = eval(self.cfg["algorithm_class_name"]) + self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) + + def learn(self): + self.set_up_logger() + + rewards_dict = {} + + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + tot_iter = self.it + self.num_learning_iterations + self.all_obs = torch.zeros(self.env.num_envs * (tot_iter - self.it + 1), 2) + self.all_obs[: self.env.num_envs, :] = actor_obs + self.save() + + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "actions": self.alg.act(actor_obs, critic_obs), + "critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "dones": self.get_timed_out(), + } + ) + storage.initialize( + transition, + self.env.num_envs, + self.env.num_envs * self.num_steps_per_env, + device=self.device, + ) + + logger.tic("runtime") + for self.it in range(self.it + 1, tot_iter + 1): + logger.tic("iteration") + logger.tic("collection") + # * Rollout + with torch.inference_mode(): + for i in range(self.num_steps_per_env): + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = 
self.get_obs(self.critic_cfg["obs"]) + start = self.env.num_envs * self.it + end = self.env.num_envs * (self.it + 1) + self.all_obs[start:end, :] = actor_obs + + # * get time_outs + timed_out = self.get_timed_out() + terminated = self.get_terminated() + dones = timed_out | terminated + + self.update_rewards(rewards_dict, terminated) + total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + + transition.update( + { + "rewards": total_rewards, + "timed_out": timed_out, + "dones": dones, + } + ) + storage.add_transitions(transition) + + logger.log_rewards(rewards_dict) + logger.log_rewards({"total_rewards": total_rewards}) + logger.finish_step(dones) + logger.toc("collection") + + logger.tic("learning") + self.alg.update(storage.data) + storage.clear() + logger.toc("learning") + logger.log_all_categories() + + logger.finish_iteration() + logger.toc("iteration") + logger.toc("runtime") + logger.print_to_terminal() + + if self.it % self.save_interval == 0: + self.save() + self.all_obs = self.all_obs.detach().cpu().numpy() + save_path = os.path.join( + LEGGED_GYM_ROOT_DIR, + "logs", + "lqrc", + "standard_training_data.npy" + if "Cholesky" not in self.critic_cfg["critic_class_name"] + else "custom_training_data.npy", + ) + + dir_path = os.path.dirname(save_path) + if not os.path.exists(dir_path): + os.makedirs(dir_path) + np.save(save_path, self.all_obs) + print(f"Saved training observations to {save_path}") + self.save() + + def update_rewards(self, rewards_dict, terminated): + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], mask=terminated + ) + ) + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~terminated, + ) + ) + + def set_up_logger(self): + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) + logger.register_rewards( + list(self.critic_cfg["reward"]["termination_weight"].keys()) + ) + logger.register_rewards(["total_rewards"]) + logger.register_category( + "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) diff --git a/learning/runners/datalogging_runner.py b/learning/runners/datalogging_runner.py new file mode 100644 index 00000000..2f853fae --- /dev/null +++ b/learning/runners/datalogging_runner.py @@ -0,0 +1,213 @@ +import os +import torch +from tensordict import TensorDict + +from learning.utils import Logger + +from .BaseRunner import BaseRunner +from learning.storage import DictStorage +from learning.utils import export_to_numpy + +logger = Logger() +storage = DictStorage() + + +class DataLoggingRunner(BaseRunner): + def __init__(self, env, train_cfg, device="cpu"): + super().__init__(env, train_cfg, device) + logger.initialize( + self.env.num_envs, + self.env.dt, + self.cfg["max_iterations"], + self.device, + ) + + def learn(self): + self.set_up_logger() + + rewards_dict = {} + + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + tot_iter = self.it + self.num_learning_iterations + + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "next_actor_obs": actor_obs, + "actions": self.alg.act(actor_obs), + "critic_obs": critic_obs, + "next_critic_obs": critic_obs, + "rewards": 
self.get_rewards({"termination": 0.0})["termination"], + "timed_out": self.get_timed_out(), + "terminated": self.get_terminated(), + "dones": self.get_timed_out() | self.get_terminated(), + } + ) + storage.initialize( + transition, + self.env.num_envs, + self.env.num_envs * self.num_steps_per_env, + device=self.device, + ) + + self.save() + # burn in observation normalization. + if self.actor_cfg["normalize_obs"] or self.critic_cfg["normalize_obs"]: + self.burn_in_normalization() + self.env.reset() + + logger.tic("runtime") + for self.it in range(self.it + 1, tot_iter + 1): + logger.tic("iteration") + logger.tic("collection") + # * Rollout + with torch.inference_mode(): + for i in range(self.num_steps_per_env): + actions = self.alg.act(actor_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + + # * get time_outs + timed_out = self.get_timed_out() + terminated = self.get_terminated() + dones = timed_out | terminated + + self.update_rewards(rewards_dict, terminated) + total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + + transition.update( + { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, + "rewards": total_rewards, + "timed_out": timed_out, + "terminated": terminated, + "dones": dones, + } + ) + storage.add_transitions(transition) + + logger.log_rewards(rewards_dict) + logger.log_rewards({"total_rewards": total_rewards}) + logger.finish_step(dones) + logger.toc("collection") + + logger.tic("learning") + self.save() + self.alg.update(storage.data) + storage.clear() + logger.toc("learning") + logger.log_all_categories() + + logger.finish_iteration() + logger.toc("iteration") + logger.toc("runtime") + logger.print_to_terminal() + + # if self.it % self.save_interval == 0: + # self.save() + # self.save() + # print(f"Saved data from LQR Pendulum run to {dict_save_path}") + + @torch.no_grad + def burn_in_normalization(self, n_iterations=100): + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + for _ in range(n_iterations): + actions = self.alg.act(actor_obs) + self.set_actions(self.actor_cfg["actions"], actions) + self.env.step() + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + self.alg.critic.evaluate(critic_obs) + self.env.reset() + + def update_rewards(self, rewards_dict, terminated): + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], mask=terminated + ) + ) + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~terminated, + ) + ) + + def set_up_logger(self): + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) + logger.register_rewards( + list(self.critic_cfg["reward"]["termination_weight"].keys()) + ) + logger.register_rewards(["total_rewards"]) + logger.register_category( + "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) + + def save(self): + os.makedirs(self.log_dir, exist_ok=True) + path = 
os.path.join(self.log_dir, "model_{}.pt".format(self.it)) + torch.save( + { + "actor_state_dict": self.alg.actor.state_dict(), + "critic_state_dict": self.alg.critic.state_dict(), + "optimizer_state_dict": self.alg.optimizer.state_dict(), + "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(), + "iter": self.it, + }, + path, + ) + path_data = os.path.join(self.log_dir, "data_{}".format(self.it)) + torch.save(storage.data.cpu(), path_data + ".pt") + export_to_numpy(storage.data, path_data + ".npz") + + def load(self, path, load_optimizer=True): + loaded_dict = torch.load(path, weights_only=True) + self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) + self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) + if load_optimizer: + self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) + self.alg.critic_optimizer.load_state_dict( + loaded_dict["critic_optimizer_state_dict"] + ) + self.it = loaded_dict["iter"] + + def switch_to_eval(self): + self.alg.actor.eval() + self.alg.critic.eval() + + def get_inference_actions(self): + obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) + return self.alg.actor.act_inference(obs) + + def export(self, path): + self.alg.actor.export(path) diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index c3b0b477..e0da1a1b 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -1,81 +1,135 @@ import torch +from tensordict import TensorDict from learning.utils import Logger -from learning.utils import PotentialBasedRewardShaping -from learning.utils import remove_zero_weighted_rewards - from .on_policy_runner import OnPolicyRunner +from learning.storage import DictStorage +from learning.algorithms import PPO2 # noqa F401 +from learning.modules.actor import Actor +from learning.modules.critic import Critic # noqa F401 +from learning.modules.QRCritics import * # noqa F401 logger = Logger() +storage = DictStorage() class MyRunner(OnPolicyRunner): def __init__(self, env, train_cfg, device="cpu"): super().__init__(env, train_cfg, device) - logger.initialize( - self.env.num_envs, - self.env.dt, - self.cfg["max_iterations"], - self.device, - ) - def learn(self): - self.set_up_logger() - remove_zero_weighted_rewards(self.policy_cfg["reward"]["pbrs_weights"]) - PBRS = PotentialBasedRewardShaping( - self.policy_cfg["reward"]["pbrs_weights"], self.device - ) - logger.register_rewards(PBRS.get_reward_keys()) + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) # noqa: F841 + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + critic_class_name = self.critic_cfg["critic_class_name"] + critic = eval(f"{critic_class_name}(num_critic_obs, **self.critic_cfg)") + alg_class = eval(self.cfg["algorithm_class_name"]) + self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) - rewards_dict = {} + def learn(self, states_to_log_dict=None): + n_policy_steps = int((1 / self.env.dt) / self.actor_cfg["frequency"]) + assert n_policy_steps > 0, "actor frequency should be less than ctrl_freq" + self.set_up_logger(dt=self.env.dt * n_policy_steps) - self.alg.actor_critic.train() - actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) - tot_iter = self.it + self.num_learning_iterations + rewards_dict = 
self.initialize_rewards_dict(n_policy_steps) + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + tot_iter = self.it + self.num_learning_iterations self.save() + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "next_actor_obs": actor_obs, + "actions": self.alg.act(actor_obs), + "critic_obs": critic_obs, + "next_critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "timed_out": self.get_timed_out(), + "terminated": self.get_terminated(), + "dones": self.get_timed_out() | self.get_terminated(), + } + ) + storage.initialize( + transition, + self.env.num_envs, + self.env.num_envs * self.num_steps_per_env, + device=self.device, + ) + + # burn in observation normalization. + self.burn_in_normalization() + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") logger.tic("collection") + + # * Simulate environment and log states + if states_to_log_dict is not None: + it_idx = self.it - 1 + if it_idx % 10 == 0: + self.sim_and_log_states(states_to_log_dict, it_idx) + # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): - actions = self.alg.act(actor_obs, critic_obs) + actions = self.alg.act(actor_obs) self.set_actions( - self.policy_cfg["actions"], + self.actor_cfg["actions"], actions, - self.policy_cfg["disable_actions"], + self.actor_cfg["disable_actions"], ) - PBRS.pre_step(self.env) - self.env.step() + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + for step in range(n_policy_steps): + self.env.step() + # put reward integration here + self.update_rewards_dict(rewards_dict, step) + else: + # catch and reset failed envs + to_be_reset = self.env.timed_out | self.env.terminated + env_ids = (to_be_reset).nonzero(as_tuple=False).flatten() + self.env._reset_idx(env_ids) + total_rewards = torch.stack( + tuple(rewards_dict.sum(dim=0).values()) + ).sum(dim=(0)) actor_obs = self.get_noisy_obs( - self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + self.actor_cfg["obs"], self.actor_cfg["noise"] ) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) - # * get time_outs - timed_out = self.get_timed_out() - terminated = self.get_terminated() - dones = timed_out | terminated + critic_obs = self.get_obs(self.critic_cfg["obs"]) - self.update_rewards(rewards_dict, terminated) - rewards_dict.update(PBRS.post_step(self.env, dones)) - total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + transition.update( + { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, + "rewards": total_rewards, + "timed_out": self.env.timed_out, + "dones": self.env.timed_out | self.env.terminated, + } + ) + storage.add_transitions(transition) - logger.log_rewards(rewards_dict) + logger.log_rewards(rewards_dict.sum(dim=0)) logger.log_rewards({"total_rewards": total_rewards}) - logger.finish_step(dones) - - self.alg.process_env_step(total_rewards, dones, timed_out) - self.alg.compute_returns(critic_obs) + logger.finish_step(self.env.timed_out | self.env.terminated) logger.toc("collection") logger.tic("learning") - self.alg.update() + self.alg.update(storage.data) + storage.clear() logger.toc("learning") logger.log_all_categories() @@ -88,33 +142,118 @@ def learn(self): self.save() self.save() + @torch.no_grad + def burn_in_normalization(self, 
n_iterations=100): + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + for _ in range(n_iterations): + actions = self.alg.act(actor_obs) + self.set_actions(self.actor_cfg["actions"], actions) + self.env.step() + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + self.alg.critic.evaluate(critic_obs) + self.env.reset() + + def set_up_logger(self, dt=None): + if dt is None: + dt = self.env.dt + logger.initialize( + self.env.num_envs, dt, self.cfg["max_iterations"], self.device + ) + + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) + logger.register_rewards( + list(self.critic_cfg["reward"]["termination_weight"].keys()) + ) + logger.register_rewards(["total_rewards"]) + logger.register_category( + "algorithm", + self.alg, + ["mean_value_loss", "mean_surrogate_loss", "learning_rate"], + ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) + def update_rewards(self, rewards_dict, terminated): + # sum existing rewards with new rewards + rewards_dict.update( self.get_rewards( - self.policy_cfg["reward"]["termination_weight"], mask=terminated + self.critic_cfg["reward"]["termination_weight"], mask=terminated ) ) rewards_dict.update( self.get_rewards( - self.policy_cfg["reward"]["weights"], + self.critic_cfg["reward"]["weights"], modifier=self.env.dt, mask=~terminated, ) ) - def set_up_logger(self): - logger.register_rewards(list(self.policy_cfg["reward"]["weights"].keys())) - logger.register_rewards( - list(self.policy_cfg["reward"]["termination_weight"].keys()) + def initialize_rewards_dict(self, n_steps): + # sum existing rewards with new rewards + rewards_dict = TensorDict( + {}, batch_size=(n_steps, self.env.num_envs), device=self.device ) - logger.register_rewards(["total_rewards"]) - logger.register_category( - "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + for key in self.critic_cfg["reward"]["termination_weight"]: + rewards_dict.update( + {key: torch.zeros(n_steps, self.env.num_envs, device=self.device)} + ) + for key in self.critic_cfg["reward"]["weights"]: + rewards_dict.update( + {key: torch.zeros(n_steps, self.env.num_envs, device=self.device)} + ) + return rewards_dict + + def update_rewards_dict(self, rewards_dict, step): + # sum existing rewards with new rewards + rewards_dict[step].update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], + modifier=self.env.dt, + mask=self.env.terminated, + ), + inplace=True, ) - logger.register_category( - "actor", self.alg.actor_critic, ["action_std", "entropy"] + rewards_dict[step].update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~self.env.terminated, + ), + inplace=True, ) - logger.attach_torch_obj_to_wandb( - (self.alg.actor_critic.actor, self.alg.actor_critic.critic) - ) + def sim_and_log_states(self, states_to_log_dict, it_idx): + # Simulate environment for as many steps as expected in the dict. + # Log states to the dict, as well as whether the env terminated. 
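For reference, a sketch of the states_to_log_dict layout that sim_and_log_states expects; the state names, iteration count, and sizes below are illustrative assumptions, not values from this patch.

import torch

# One tensor per logged state, indexed [env_slot, iteration, step, state_dim].
# The runner fills only the first environment, and only every 10th iteration.
max_iterations, steps = 30, 100
states_to_log_dict = {
    "terminated": torch.zeros(1, max_iterations, steps, 1, dtype=torch.bool),
    "dof_pos": torch.zeros(1, max_iterations, steps, 12),  # hypothetical 12-DoF state
}
# runner.learn(states_to_log_dict=states_to_log_dict) would then populate it during training.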
+ steps = states_to_log_dict["terminated"].shape[2] + actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) + + with torch.inference_mode(): + for i in range(steps): + actions = self.alg.act(actor_obs) + self.set_actions( + self.policy_cfg["actions"], + actions, + self.policy_cfg["disable_actions"], + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + ) + + # Log states (just for the first env) + terminated = self.get_terminated()[0] + for state in states_to_log_dict: + if state == "terminated": + states_to_log_dict[state][0, it_idx, i, :] = terminated + else: + states_to_log_dict[state][0, it_idx, i, :] = getattr( + self.env, state + )[0, :] diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py new file mode 100644 index 00000000..83249aef --- /dev/null +++ b/learning/runners/off_policy_runner.py @@ -0,0 +1,318 @@ +import os +import torch +from tensordict import TensorDict + +from learning.utils import Logger + +from .BaseRunner import BaseRunner +from learning.modules import Critic, ChimeraActor +from learning.storage import ReplayBuffer +from learning.algorithms import SAC + +logger = Logger() +storage = ReplayBuffer() + + +class OffPolicyRunner(BaseRunner): + def __init__(self, env, train_cfg, device="cpu"): + super().__init__(env, train_cfg, device) + + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + actor = ChimeraActor(num_actor_obs, num_actions, **self.actor_cfg) + critic_1 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + critic_2 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + target_critic_1 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + target_critic_2 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + + print(actor) + + self.alg = SAC( + actor, + critic_1, + critic_2, + target_critic_1, + target_critic_2, + device=self.device, + **self.alg_cfg, + ) + + def learn(self): + n_policy_steps = int((1 / self.env.dt) / self.actor_cfg["frequency"]) + assert n_policy_steps > 0, "actor frequency should be less than ctrl_freq" + self.set_up_logger(dt=self.env.dt * n_policy_steps) + + rewards_dict = self.initialize_rewards_dict(n_policy_steps) + + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + actions = self.alg.act(actor_obs) + tot_iter = self.it + self.num_learning_iterations + self.save() + + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "next_actor_obs": actor_obs, + "actions": self.alg.act(actor_obs), + "critic_obs": critic_obs, + "next_critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "dones": self.get_timed_out(), + } + ) + storage.initialize( + transition, + self.env.num_envs, + self.alg_cfg["storage_size"], + device=self.device, + ) + + # fill buffer + for _ in range(self.alg_cfg["initial_fill"]): + with torch.inference_mode(): + actions = torch.rand_like(actions) * 2 - 1 + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + + for step in range(n_policy_steps): + self.env.step() + # put 
reward integration here + self.update_rewards_dict(rewards_dict, step) + else: + # catch and reset failed envs + to_be_reset = self.env.timed_out | self.env.terminated + env_ids = (to_be_reset).nonzero(as_tuple=False).flatten() + self.env._reset_idx(env_ids) + + total_rewards = torch.stack( + tuple(rewards_dict.sum(dim=0).values()) + ).sum(dim=(0)) + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + + transition.update( + { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, + "rewards": total_rewards, + "timed_out": self.env.timed_out, + "dones": self.env.timed_out | self.env.terminated, + } + ) + storage.add_transitions(transition) + # print every 10% of initial fill + if (self.alg_cfg["initial_fill"] > 10) and ( + _ % (self.alg_cfg["initial_fill"] // 10) == 0 + ): + print(f"Filled {100 * _ / self.alg_cfg['initial_fill']}%") + + logger.tic("runtime") + for self.it in range(self.it + 1, tot_iter + 1): + logger.tic("iteration") + logger.tic("collection") + # * Rollout + with torch.inference_mode(): + for i in range(self.num_steps_per_env): + actions = self.alg.act(actor_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + + for step in range(n_policy_steps): + self.env.step() + # put reward integration here + self.update_rewards_dict(rewards_dict, step) + else: + # catch and reset failed envs + to_be_reset = self.env.timed_out | self.env.terminated + env_ids = (to_be_reset).nonzero(as_tuple=False).flatten() + self.env._reset_idx(env_ids) + + total_rewards = torch.stack( + tuple(rewards_dict.sum(dim=0).values()) + ).sum(dim=(0)) + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + + transition.update( + { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, + "rewards": total_rewards, + "timed_out": self.env.timed_out, + "dones": self.env.timed_out | self.env.terminated, + } + ) + storage.add_transitions(transition) + + logger.log_rewards(rewards_dict.sum(dim=0)) + logger.log_rewards({"total_rewards": total_rewards}) + logger.finish_step(self.env.timed_out | self.env.terminated) + logger.toc("collection") + + logger.tic("learning") + self.alg.update(storage.get_data()) + logger.toc("learning") + logger.log_all_categories() + + logger.finish_iteration() + logger.toc("iteration") + logger.toc("runtime") + logger.print_to_terminal() + + if self.it % self.save_interval == 0: + self.save() + self.save() + + def update_rewards_dict(self, rewards_dict, step): + # sum existing rewards with new rewards + rewards_dict[step].update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], + modifier=self.env.dt, + mask=self.env.terminated, + ), + inplace=True, + ) + rewards_dict[step].update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~self.env.terminated, + ), + inplace=True, + ) + + def initialize_rewards_dict(self, n_steps): + # sum existing rewards with new rewards + rewards_dict = TensorDict( + {}, batch_size=(n_steps, self.env.num_envs), device=self.device + ) + for key in self.critic_cfg["reward"]["termination_weight"]: + rewards_dict.update( + {key: torch.zeros(n_steps, self.env.num_envs, device=self.device)} + ) + for key in self.critic_cfg["reward"]["weights"]: + 
rewards_dict.update( + {key: torch.zeros(n_steps, self.env.num_envs, device=self.device)} + ) + return rewards_dict + + def set_up_logger(self, dt=None): + if dt is None: + dt = self.env.dt + logger.initialize( + self.env.num_envs, dt, self.cfg["max_iterations"], self.device + ) + + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) + logger.register_rewards( + list(self.critic_cfg["reward"]["termination_weight"].keys()) + ) + logger.register_rewards(["total_rewards"]) + logger.register_category( + "algorithm", + self.alg, + [ + "mean_critic_1_loss", + "mean_critic_2_loss", + "mean_actor_loss", + "mean_alpha_loss", + "alpha", + "alpha_lr", + "critic_1_lr", + "critic_2_lr", + "actor_lr", + ], + ) + # logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + + logger.attach_torch_obj_to_wandb( + (self.alg.actor, self.alg.critic_1, self.alg.critic_2) + ) + + def save(self): + os.makedirs(self.log_dir, exist_ok=True) + path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) + save_dict = { + "actor_state_dict": self.alg.actor.state_dict(), + "critic_1_state_dict": self.alg.critic_1.state_dict(), + "critic_2_state_dict": self.alg.critic_2.state_dict(), + "log_alpha": self.alg.log_alpha, + "actor_optimizer_state_dict": self.alg.actor_optimizer.state_dict(), + "critic_1_optimizer_state_dict": self.alg.critic_1_optimizer.state_dict(), + "critic_2_optimizer_state_dict": self.alg.critic_2_optimizer.state_dict(), + "log_alpha_optimizer_state_dict": self.alg.log_alpha_optimizer.state_dict(), + "iter": self.it, + } + torch.save(save_dict, path) + + def load(self, path, load_optimizer=True): + loaded_dict = torch.load(path, weights_only=True) + self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) + self.alg.critic_1.load_state_dict(loaded_dict["critic_1_state_dict"]) + self.alg.critic_2.load_state_dict(loaded_dict["critic_2_state_dict"]) + self.log_alpha = loaded_dict["log_alpha"] + if load_optimizer: + self.alg.actor_optimizer.load_state_dict( + loaded_dict["actor_optimizer_state_dict"] + ) + self.alg.critic_1_optimizer.load_state_dict( + loaded_dict["critic_1_optimizer_state_dict"] + ) + self.alg.critic_2_optimizer.load_state_dict( + loaded_dict["critic_2_optimizer_state_dict"] + ) + self.alg.log_alpha_optimizer.load_state_dict( + loaded_dict["log_alpha_optimizer_state_dict"] + ) + self.it = loaded_dict["iter"] + + def switch_to_eval(self): + self.alg.actor.eval() + self.alg.critic_1.eval() + self.alg.critic_2.eval() + + def get_inference_actions(self): + obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) + mean = self.alg.actor.forward(obs) + actions = torch.tanh(mean) + actions = (actions * self.alg.action_delta + self.alg.action_offset).clamp( + self.alg.action_min, self.alg.action_max + ) + return actions + + def export(self, path): + self.alg.actor.export(path) diff --git a/learning/runners/old_policy_runner.py b/learning/runners/old_policy_runner.py new file mode 100644 index 00000000..d6fe5ddf --- /dev/null +++ b/learning/runners/old_policy_runner.py @@ -0,0 +1,164 @@ +import os +import torch + +from learning.utils import Logger +from .BaseRunner import BaseRunner +from learning.algorithms import PPO # noqa: F401 +from learning.modules import ActorCritic, Actor, Critic + +logger = Logger() + + +class OldPolicyRunner(BaseRunner): + def __init__(self, env, train_cfg, device="cpu"): + super().__init__(env, train_cfg, device) + self.init_storage() + logger.initialize( + self.env.num_envs, + self.env.dt, + 
self.cfg["max_iterations"], + self.device, + ) + + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + critic = Critic(num_critic_obs, **self.critic_cfg) + actor_critic = ActorCritic(actor, critic) + alg_class = eval(self.cfg["algorithm_class_name"]) + self.alg = alg_class(actor_critic, device=self.device, **self.alg_cfg) + + def learn(self): + self.set_up_logger() + + rewards_dict = {} + + self.alg.actor_critic.train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + tot_iter = self.it + self.num_learning_iterations + + self.save() + + logger.tic("runtime") + for self.it in range(self.it + 1, tot_iter + 1): + logger.tic("iteration") + logger.tic("collection") + # * Rollout + with torch.inference_mode(): + for i in range(self.num_steps_per_env): + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + # * get time_outs + timed_out = self.get_timed_out() + terminated = self.get_terminated() + dones = timed_out | terminated + + self.update_rewards(rewards_dict, terminated) + total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + + logger.log_rewards(rewards_dict) + logger.log_rewards({"total_rewards": total_rewards}) + logger.finish_step(dones) + + self.alg.process_env_step(total_rewards, dones, timed_out) + self.alg.compute_returns(critic_obs) + logger.toc("collection") + + logger.tic("learning") + self.alg.update() + logger.toc("learning") + logger.log_all_categories() + + logger.finish_iteration() + logger.toc("iteration") + logger.toc("runtime") + logger.print_to_terminal() + + if self.it % self.save_interval == 0: + self.save() + self.save() + + def update_rewards(self, rewards_dict, terminated): + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], mask=terminated + ) + ) + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~terminated, + ) + ) + + def set_up_logger(self): + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) + logger.register_rewards( + list(self.critic_cfg["reward"]["termination_weight"].keys()) + ) + logger.register_rewards(["total_rewards"]) + logger.register_category( + "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + ) + + logger.attach_torch_obj_to_wandb( + (self.alg.actor_critic.actor, self.alg.actor_critic.critic) + ) + + def save(self): + os.makedirs(self.log_dir, exist_ok=True) + path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) + torch.save( + { + "actor_state_dict": self.alg.actor_critic.actor.state_dict(), + "critic_state_dict": self.alg.actor_critic.critic.state_dict(), + "optimizer_state_dict": self.alg.optimizer.state_dict(), + "iter": self.it, + }, + path, + ) + + def load(self, path, load_optimizer=True): + loaded_dict = torch.load(path, weights_only=True) + self.alg.actor_critic.actor.load_state_dict(loaded_dict["actor_state_dict"]) + self.alg.actor_critic.critic.load_state_dict(loaded_dict["critic_state_dict"]) + if load_optimizer: + 
self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) + self.it = loaded_dict["iter"] + + def switch_to_eval(self): + self.alg.actor_critic.eval() + + def get_inference_actions(self): + obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) + return self.alg.actor_critic.actor.act_inference(obs) + + def export(self, path): + self.alg.actor_critic.export_policy(path) + + def init_storage(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + self.alg.init_storage( + self.env.num_envs, + self.num_steps_per_env, + actor_obs_shape=[num_actor_obs], + critic_obs_shape=[num_critic_obs], + action_shape=[num_actions], + ) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 3a40974a..4831b615 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -1,73 +1,125 @@ import os import torch +from tensordict import TensorDict from learning.utils import Logger from .BaseRunner import BaseRunner +from learning.storage import DictStorage logger = Logger() +storage = DictStorage() class OnPolicyRunner(BaseRunner): def __init__(self, env, train_cfg, device="cpu"): super().__init__(env, train_cfg, device) - logger.initialize( - self.env.num_envs, - self.env.dt, - self.cfg["max_iterations"], - self.device, - ) - def learn(self): - self.set_up_logger() + def learn(self, states_to_log_dict=None): + n_policy_steps = int((1 / self.env.dt) / self.actor_cfg["frequency"]) + assert n_policy_steps > 0, "actor frequency should be less than ctrl_freq" + self.set_up_logger(dt=self.env.dt * n_policy_steps) - rewards_dict = {} + rewards_dict = self.initialize_rewards_dict(n_policy_steps) - self.alg.actor_critic.train() - actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) tot_iter = self.it + self.num_learning_iterations - self.save() + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "next_actor_obs": actor_obs, + "actions": self.alg.act(actor_obs), + "critic_obs": critic_obs, + "next_critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "timed_out": self.get_timed_out(), + "terminated": self.get_terminated(), + "dones": self.get_timed_out() | self.get_terminated(), + } + ) + storage.initialize( + transition, + self.env.num_envs, + self.env.num_envs * self.num_steps_per_env, + device=self.device, + ) + + # burn in observation normalization. 
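As a worked example of the decimation set up at the top of learn() above (the numbers are hypothetical, not taken from any config in this patch):

# env.dt = 0.005 s -> control loop at 200 Hz (assumed)
# actor frequency  = 50 Hz                   (assumed)
env_dt, actor_frequency = 0.005, 50
n_policy_steps = int((1 / env_dt) / actor_frequency)  # 200 / 50 = 4 sim steps per action
logger_dt = env_dt * n_policy_steps                   # 0.02 s between logged policy steps
assert n_policy_steps > 0  # would fail if the actor frequency exceeded the control rate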
+ if self.actor_cfg["normalize_obs"] or self.critic_cfg["normalize_obs"]: + self.burn_in_normalization() + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") logger.tic("collection") + + # * Simulate environment and log states + if states_to_log_dict is not None: + it_idx = self.it - 1 + if it_idx % 10 == 0: + self.sim_and_log_states(states_to_log_dict, it_idx) + # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): - actions = self.alg.act(actor_obs, critic_obs) + actions = self.alg.act(actor_obs) self.set_actions( - self.policy_cfg["actions"], + self.actor_cfg["actions"], actions, - self.policy_cfg["disable_actions"], + self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } ) + for step in range(n_policy_steps): + self.env.step() + # put reward integration here + self.update_rewards_dict(rewards_dict, step) + else: + # catch and reset failed envs + to_be_reset = self.env.timed_out | self.env.terminated + env_ids = (to_be_reset).nonzero(as_tuple=False).flatten() + self.env._reset_idx(env_ids) - self.env.step() + total_rewards = torch.stack( + tuple(rewards_dict.sum(dim=0).values()) + ).sum(dim=(0)) actor_obs = self.get_noisy_obs( - self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + self.actor_cfg["obs"], self.actor_cfg["noise"] ) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) - # * get time_outs - timed_out = self.get_timed_out() - terminated = self.get_terminated() - dones = timed_out | terminated + critic_obs = self.get_obs(self.critic_cfg["obs"]) - self.update_rewards(rewards_dict, terminated) - total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + transition.update( + { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, + "rewards": total_rewards, + "timed_out": self.env.timed_out, + "dones": self.env.timed_out | self.env.terminated, + } + ) + storage.add_transitions(transition) - logger.log_rewards(rewards_dict) + logger.log_rewards(rewards_dict.sum(dim=0)) logger.log_rewards({"total_rewards": total_rewards}) - logger.finish_step(dones) - - self.alg.process_env_step(total_rewards, dones, timed_out) - self.alg.compute_returns(critic_obs) + logger.finish_step(self.env.timed_out | self.env.terminated) logger.toc("collection") logger.tic("learning") - self.alg.update() + self.alg.update(storage.data) + storage.clear() logger.toc("learning") logger.log_all_categories() @@ -80,76 +132,139 @@ def learn(self): self.save() self.save() - def update_rewards(self, rewards_dict, terminated): - rewards_dict.update( - self.get_rewards( - self.policy_cfg["reward"]["termination_weight"], mask=terminated + @torch.no_grad + def burn_in_normalization(self, n_iterations=100): + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + for _ in range(n_iterations): + actions = self.alg.act(actor_obs) + self.set_actions(self.actor_cfg["actions"], actions) + self.env.step() + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + self.alg.critic.evaluate(critic_obs) + self.env.reset() + + def update_rewards_dict(self, rewards_dict, step): + # sum existing rewards with new rewards + rewards_dict[step].update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], + modifier=self.env.dt, + mask=self.env.terminated, + ), + inplace=True, ) - rewards_dict.update( + 
rewards_dict[step].update( self.get_rewards( - self.policy_cfg["reward"]["weights"], + self.critic_cfg["reward"]["weights"], modifier=self.env.dt, - mask=~terminated, + mask=~self.env.terminated, + ), + inplace=True, + ) + + def initialize_rewards_dict(self, n_steps): + # sum existing rewards with new rewards + rewards_dict = TensorDict( + {}, batch_size=(n_steps, self.env.num_envs), device=self.device + ) + for key in self.critic_cfg["reward"]["termination_weight"]: + rewards_dict.update( + {key: torch.zeros(n_steps, self.env.num_envs, device=self.device)} ) + for key in self.critic_cfg["reward"]["weights"]: + rewards_dict.update( + {key: torch.zeros(n_steps, self.env.num_envs, device=self.device)} + ) + return rewards_dict + + def set_up_logger(self, dt=None): + if dt is None: + dt = self.env.dt + logger.initialize( + self.env.num_envs, dt, self.cfg["max_iterations"], self.device ) - def set_up_logger(self): - logger.register_rewards(list(self.policy_cfg["reward"]["weights"].keys())) + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) logger.register_rewards( - list(self.policy_cfg["reward"]["termination_weight"].keys()) + list(self.critic_cfg["reward"]["termination_weight"].keys()) ) logger.register_rewards(["total_rewards"]) logger.register_category( - "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] - ) - logger.register_category( - "actor", self.alg.actor_critic, ["action_std", "entropy"] + "algorithm", + self.alg, + ["mean_value_loss", "mean_surrogate_loss", "learning_rate"], ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) - logger.attach_torch_obj_to_wandb( - (self.alg.actor_critic.actor, self.alg.actor_critic.critic) - ) + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) def save(self): os.makedirs(self.log_dir, exist_ok=True) path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) torch.save( { - "actor_state_dict": self.alg.actor_critic.actor.state_dict(), - "critic_state_dict": self.alg.actor_critic.critic.state_dict(), + "actor_state_dict": self.alg.actor.state_dict(), + "critic_state_dict": self.alg.critic.state_dict(), "optimizer_state_dict": self.alg.optimizer.state_dict(), + "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(), "iter": self.it, }, path, ) def load(self, path, load_optimizer=True): - loaded_dict = torch.load(path) - self.alg.actor_critic.actor.load_state_dict(loaded_dict["actor_state_dict"]) - self.alg.actor_critic.critic.load_state_dict(loaded_dict["critic_state_dict"]) + loaded_dict = torch.load(path, weights_only=True) + self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) + self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) if load_optimizer: self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) + self.alg.critic_optimizer.load_state_dict( + loaded_dict["critic_optimizer_state_dict"] + ) self.it = loaded_dict["iter"] def switch_to_eval(self): - self.alg.actor_critic.eval() + self.alg.actor.eval() + self.alg.critic.eval() def get_inference_actions(self): - obs = self.get_noisy_obs(self.policy_cfg["actor_obs"], self.policy_cfg["noise"]) - return self.alg.actor_critic.actor.act_inference(obs) + obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) + return self.alg.actor.act_inference(obs) def export(self, path): - self.alg.actor_critic.export_policy(path) + self.alg.actor.export(path) - def init_storage(self): - num_actor_obs = self.get_obs_size(self.policy_cfg["actor_obs"]) 
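A self-contained sketch of the reward-integration pattern used by initialize_rewards_dict and update_rewards_dict above: one (n_policy_steps, num_envs) buffer per reward term, filled after every sim sub-step and reduced when the transition is stored. The reward keys and sizes are made up for illustration.

import torch
from tensordict import TensorDict

n_policy_steps, num_envs = 4, 8
keys = ["tracking", "torque_penalty"]  # hypothetical reward terms

rewards_dict = TensorDict(
    {k: torch.zeros(n_policy_steps, num_envs) for k in keys},
    batch_size=(n_policy_steps, num_envs),
)

for step in range(n_policy_steps):
    # stand-in for self.get_rewards(...) called after each env.step()
    rewards_dict[step].update({k: torch.rand(num_envs) for k in keys}, inplace=True)

per_term = rewards_dict.sum(dim=0)  # integrate each term over the sub-steps
total_rewards = torch.stack(tuple(per_term.values())).sum(dim=0)  # shape (num_envs,)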
- num_critic_obs = self.get_obs_size(self.policy_cfg["critic_obs"]) - num_actions = self.get_action_size(self.policy_cfg["actions"]) - self.alg.init_storage( - self.env.num_envs, - self.num_steps_per_env, - actor_obs_shape=[num_actor_obs], - critic_obs_shape=[num_critic_obs], - action_shape=[num_actions], - ) + def sim_and_log_states(self, states_to_log_dict, it_idx): + # Simulate environment for as many steps as expected in the dict. + # Log states to the dict, as well as whether the env terminated. + steps = states_to_log_dict["terminated"].shape[2] + actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) + + with torch.inference_mode(): + for i in range(steps): + actions = self.alg.act(actor_obs) + self.set_actions( + self.policy_cfg["actions"], + actions, + self.policy_cfg["disable_actions"], + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + ) + + # Log states (just for the first env) + terminated = self.get_terminated()[0] + for state in states_to_log_dict: + if state == "terminated": + states_to_log_dict[state][0, it_idx, i, :] = terminated + else: + states_to_log_dict[state][0, it_idx, i, :] = getattr( + self.env, state + )[0, :] diff --git a/learning/runners/psd_sac_runner.py b/learning/runners/psd_sac_runner.py new file mode 100644 index 00000000..31afda18 --- /dev/null +++ b/learning/runners/psd_sac_runner.py @@ -0,0 +1,54 @@ +from learning.utils import Logger + +from learning.runners import OffPolicyRunner +from learning.modules import Critic, ChimeraActor # noqa F401 +from learning.modules.QRCritics import * # noqa F401 + +from learning.storage import ReplayBuffer +from learning.algorithms import SAC + + +logger = Logger() +storage = ReplayBuffer() + + +class PSACRunner(OffPolicyRunner): + def __init__(self, env, train_cfg, device="cpu"): + super().__init__(env, train_cfg, device) + logger.initialize( + self.env.num_envs, + self.env.dt, + self.cfg["max_iterations"], + self.device, + ) + + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_in = self.get_obs_size(self.critic_cfg["obs"]) + num_actions # noqa: F841 + actor = ChimeraActor(num_actor_obs, num_actions, **self.actor_cfg) + + critic_class = self.critic_cfg["critic_class_name"] + critic_1 = eval(f"{critic_class}(num_critic_in, **self.critic_cfg)") + critic_2 = eval(f"{critic_class}(num_critic_in, **self.critic_cfg)") + target_critic_1 = eval(f"{critic_class}(num_critic_in, **self.critic_cfg)") + target_critic_2 = eval(f"{critic_class}(num_critic_in, **self.critic_cfg)") + + # critic_1 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + # critic_2 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + + # target_critic_1 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + # target_critic_2 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + + print(actor) + print(critic_1) + + self.alg = SAC( + actor, + critic_1, + critic_2, + target_critic_1, + target_critic_2, + device=self.device, + **self.alg_cfg, + ) diff --git a/learning/storage/SE_storage.py b/learning/storage/SE_storage.py index a63f5f9f..0f4eaf41 100644 --- a/learning/storage/SE_storage.py +++ b/learning/storage/SE_storage.py @@ -53,9 +53,9 @@ def clear(self): def mini_batch_generator(self, num_mini_batches, num_epochs=8): """Generate mini batch for learning""" batch_size = self.num_envs * self.num_transitions_per_env - mini_batch_size = batch_size // 
num_mini_batches + batch_size = batch_size // num_mini_batches indices = torch.randperm( - num_mini_batches * mini_batch_size, + num_mini_batches * batch_size, requires_grad=False, device=self.device, ) @@ -69,8 +69,8 @@ def mini_batch_generator(self, num_mini_batches, num_epochs=8): for epoch in range(num_epochs): for i in range(num_mini_batches): - start = i * mini_batch_size - end = (i + 1) * mini_batch_size + start = i * batch_size + end = (i + 1) * batch_size batch_idx = indices[start:end] obs_batch = observations[batch_idx] diff --git a/learning/storage/__init__.py b/learning/storage/__init__.py index edd7bd55..47f2593b 100644 --- a/learning/storage/__init__.py +++ b/learning/storage/__init__.py @@ -3,3 +3,5 @@ from .rollout_storage import RolloutStorage from .SE_storage import SERolloutStorage +from .dict_storage import DictStorage +from .replay_buffer import ReplayBuffer \ No newline at end of file diff --git a/learning/storage/dict_storage.py b/learning/storage/dict_storage.py new file mode 100644 index 00000000..2191093c --- /dev/null +++ b/learning/storage/dict_storage.py @@ -0,0 +1,48 @@ +import torch +from tensordict import TensorDict + + +class DictStorage: + def __init__(self): + self.initialized = False + + def initialize( + self, + dummy_dict, + num_envs=2**12, + max_storage=2**17, + device="cpu", + ): + self.device = device + self.num_envs = num_envs + max_length = max_storage // num_envs + self.max_length = max_length + self.data = TensorDict({}, batch_size=(max_length, num_envs), device=device) + self.fill_count = 0 + + for key in dummy_dict.keys(): + if dummy_dict[key].dim() == 1: # if scalar + self.data[key] = torch.zeros( + (max_length, num_envs), + dtype=dummy_dict[key].dtype, + device=self.device, + ) + else: + self.data[key] = torch.zeros( + (max_length, num_envs, dummy_dict[key].shape[1]), + dtype=dummy_dict[key].dtype, + device=self.device, + ) + + @torch.inference_mode + def add_transitions(self, transition: TensorDict): + if self.fill_count >= self.max_length: + raise AssertionError("Rollout buffer overflow") + self.data[self.fill_count] = transition + self.fill_count += 1 + + def clear(self): + self.fill_count = 0 + with torch.inference_mode(): + for tensor in self.data: + tensor.zero_() diff --git a/learning/storage/replay_buffer.py b/learning/storage/replay_buffer.py new file mode 100644 index 00000000..350a2c25 --- /dev/null +++ b/learning/storage/replay_buffer.py @@ -0,0 +1,54 @@ +import torch +from tensordict import TensorDict + + +class ReplayBuffer: + def __init__(self): + self.initialized = False + + def initialize( + self, + dummy_dict, + num_envs=2**12, + max_storage=2**17, + device="cpu", + ): + self.device = device + self.num_envs = num_envs + max_length = max_storage // num_envs + self.max_length = max_length + self.data = TensorDict({}, batch_size=(max_length, num_envs), device=device) + self.fill_count = 0 + self.add_index = 0 + + for key in dummy_dict.keys(): + if dummy_dict[key].dim() == 1: # if scalar + self.data[key] = torch.zeros( + (max_length, num_envs), + dtype=dummy_dict[key].dtype, + device=self.device, + ) + else: + self.data[key] = torch.zeros( + (max_length, num_envs, dummy_dict[key].shape[1]), + dtype=dummy_dict[key].dtype, + device=self.device, + ) + + @torch.inference_mode + def add_transitions(self, transition: TensorDict): + if self.fill_count >= self.max_length and self.add_index >= self.max_length: + self.add_index = 0 + self.data[self.add_index] = transition + self.fill_count += 1 + self.add_index += 1 + + def 
get_data(self): + return self.data[: min(self.fill_count, self.max_length), :] + + def clear(self): + self.fill_count = 0 + self.add_index = 0 + with torch.inference_mode(): + for tensor in self.data: + tensor.zero_() diff --git a/learning/storage/rollout_storage.py b/learning/storage/rollout_storage.py index bba48e02..0f2885a3 100644 --- a/learning/storage/rollout_storage.py +++ b/learning/storage/rollout_storage.py @@ -182,11 +182,11 @@ def get_statistics(self): trajectory_lengths = done_indices[1:] - done_indices[:-1] return trajectory_lengths.float().mean(), self.rewards.mean() - def mini_batch_generator(self, num_mini_batches, num_epochs=8): + def mini_batch_generator(self, num_mini_batches=1, num_epochs=8): batch_size = self.num_envs * self.num_transitions_per_env - mini_batch_size = batch_size // num_mini_batches + batch_size = batch_size // num_mini_batches indices = torch.randperm( - num_mini_batches * mini_batch_size, + num_mini_batches * batch_size, requires_grad=False, device=self.device, ) @@ -207,8 +207,8 @@ def mini_batch_generator(self, num_mini_batches, num_epochs=8): for epoch in range(num_epochs): for i in range(num_mini_batches): - start = i * mini_batch_size - end = (i + 1) * mini_batch_size + start = i * batch_size + end = (i + 1) * batch_size batch_idx = indices[start:end] obs_batch = observations[batch_idx] diff --git a/learning/utils/PBRS/test_usecase.py b/learning/utils/PBRS/test_usecase.py index a84ef547..be5d0412 100644 --- a/learning/utils/PBRS/test_usecase.py +++ b/learning/utils/PBRS/test_usecase.py @@ -30,14 +30,14 @@ def test_PBRS(): logger.initialize(num_envs=1, episode_dt=0.1, total_iterations=100, device="cpu") self_env = MyTask() - # let's create a dummy policy_cfg with just the reward weights - policy_cfg = {"reward": {"weights": {"dummy": 1}}} - policy_cfg["reward"]["pbrs_weights"] = {"first": 1, "second": 2} + # let's create a dummy critic_cfg with just the reward weights + critic_cfg = {"reward": {"weights": {"dummy": 1}}} + critic_cfg["reward"]["pbrs_weights"] = {"first": 1, "second": 2} # and a dummy env PBRS = PotentialBasedRewardShaping( - policy_cfg["reward"]["pbrs_weights"], device="cpu" + critic_cfg["reward"]["pbrs_weights"], device="cpu" ) assert PBRS.get_reward_keys() == ["PBRS_first", "PBRS_second"] diff --git a/learning/utils/__init__.py b/learning/utils/__init__.py index a39a3daf..b39dc16d 100644 --- a/learning/utils/__init__.py +++ b/learning/utils/__init__.py @@ -1,40 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: BSD-3-Clause -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
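A usage sketch for the ReplayBuffer added above (DictStorage behaves the same way except that it raises on overflow instead of wrapping around); all shapes are tiny illustrative values.

import torch
from tensordict import TensorDict
from learning.storage import ReplayBuffer

num_envs, obs_dim, act_dim = 4, 3, 2
dummy = TensorDict(
    {
        "actor_obs": torch.zeros(num_envs, obs_dim),
        "actions": torch.zeros(num_envs, act_dim),
        "rewards": torch.zeros(num_envs),
        "dones": torch.zeros(num_envs, dtype=torch.bool),
    },
    batch_size=num_envs,
)

buffer = ReplayBuffer()
# max_storage=16 with 4 envs -> 4 time slots; the write index wraps once full
buffer.initialize(dummy, num_envs=num_envs, max_storage=16, device="cpu")
for _ in range(6):  # two more adds than slots, so the oldest slots get overwritten
    buffer.add_transitions(dummy)

data = buffer.get_data()        # TensorDict with batch_size (4, 4)
print(data["actor_obs"].shape)  # torch.Size([4, 4, 3])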
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -# THE POSSIBILITY OF SUCH DAMAGE. -# -# Copyright (c) 2021 ETH Zurich, Nikita Rudin - from .utils import ( - split_and_pad_trajectories, - unpad_trajectories, remove_zero_weighted_rewards, set_discount_from_horizon, + polyak_update ) +from .dict_utils import * from .logger import Logger from .PBRS.PotentialBasedRewardShaping import PotentialBasedRewardShaping \ No newline at end of file diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py new file mode 100644 index 00000000..73039526 --- /dev/null +++ b/learning/utils/dict_utils.py @@ -0,0 +1,96 @@ +import numpy as np +import torch +from tensordict import TensorDict + + +@torch.no_grad +def compute_MC_returns(data: TensorDict, gamma, critic=None): + # todo not as accurate as taking + if critic is None: + last_values = torch.zeros_like(data["rewards"][0]) + else: + last_values = critic.evaluate(data["critic_obs"][-1]) + + returns = torch.zeros_like(data["rewards"]) + returns[-1] = data["rewards"][-1] + gamma * last_values * ~data["terminated"][-1] + for k in reversed(range(data["rewards"].shape[0] - 1)): + not_done = ~data["dones"][k] + returns[k] = data["rewards"][k] + gamma * returns[k + 1] * not_done + if critic is not None: + returns[k] += ( + gamma + * critic.evaluate(data["critic_obs"][k]) + * data["timed_out"][k] + * ~data["terminated"][k] + ) + return returns + + +@torch.no_grad +def normalize(input, eps=1e-8): + return (input - input.mean()) / (input.std() + eps) + + +@torch.no_grad +def compute_generalized_advantages(data, gamma, lam, critic): + last_values = critic.evaluate(data["next_critic_obs"][-1]) + advantages = torch.zeros_like(data["values"]) + if last_values is not None: + # todo check this + not_done = ~data["dones"][-1] + advantages[-1] = ( + data["rewards"][-1] + + gamma * data["values"][-1] * data["timed_out"][-1] + + gamma * last_values * not_done + - data["values"][-1] + ) + + for k in reversed(range(data["values"].shape[0] - 1)): + not_done = ~data["dones"][k] + td_error = ( + data["rewards"][k] + + gamma * data["values"][k] * data["timed_out"][k] + + gamma * data["values"][k + 1] * not_done + - data["values"][k] + ) + advantages[k] = td_error + gamma * lam * not_done * advantages[k + 1] + + return advantages + + +# todo change num_epochs to num_batches +@torch.no_grad +def create_uniform_generator(data, batch_size, max_gradient_steps=100): + n, m = data.shape + total_data = n * m + + if batch_size > total_data: + batch_size = total_data + + num_batches_per_epoch = total_data // batch_size + num_epochs = max_gradient_steps // num_batches_per_epoch + last_epoch_batches = max_gradient_steps - num_epochs * num_batches_per_epoch + + for epoch in range(num_epochs): + indices = torch.randperm(total_data, device=data.device) + for i in 
range(num_batches_per_epoch): + batched_data = data.flatten(0, 1)[ + indices[i * batch_size : (i + 1) * batch_size] + ] + yield batched_data + else: + indices = torch.randperm(total_data, device=data.device) + for i in range(last_epoch_batches): + batched_data = data.flatten(0, 1)[ + indices[i * batch_size : (i + 1) * batch_size] + ] + yield batched_data + + +@torch.no_grad +def export_to_numpy(data, path): + # check if path ends iwth ".npz", and if not append it. + if not path.endswith(".npz"): + path += ".npz" + np.savez_compressed(path, **{key: val.cpu().numpy() for key, val in data.items()}) + return diff --git a/learning/utils/utils.py b/learning/utils/utils.py index b1a20bcf..d4eac980 100644 --- a/learning/utils/utils.py +++ b/learning/utils/utils.py @@ -1,96 +1,3 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: BSD-3-Clause -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -# THE POSSIBILITY OF SUCH DAMAGE. -# -# Copyright (c) 2021 ETH Zurich, Nikita Rudin - -import torch - - -def split_and_pad_trajectories(tensor, dones): - """ - Splits trajectories at done indices. Then concatenates them and pads - with zeros up to the length og the longest trajectory. 
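A sketch of how the create_uniform_generator helper above can be driven; the (time, env) layout mirrors what DictStorage collects, and all sizes are illustrative.

import torch
from tensordict import TensorDict
from learning.utils import create_uniform_generator

steps, num_envs, obs_dim = 8, 4, 3
data = TensorDict(
    {
        "critic_obs": torch.randn(steps, num_envs, obs_dim),
        "rewards": torch.randn(steps, num_envs),
    },
    batch_size=(steps, num_envs),
)

# 32 samples total, batch_size=8 -> 4 minibatches per epoch, capped at 6 gradient steps
for batch in create_uniform_generator(data, batch_size=8, max_gradient_steps=6):
    print(batch.batch_size)  # torch.Size([8]): flattened (time * env) samples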
- Returns masks corresponding to valid parts of the trajectories - Example: - Input: [ [a1, a2, a3, a4 | a5, a6], - [b1, b2 | b3, b4, b5 | b6] - ] - - Output:[ [a1, a2, a3, a4], | [ [True, True, True, True], - [a5, a6, 0, 0], | [True, True, False, False], - [b1, b2, 0, 0], | [True, True, False, False], - [b3, b4, b5, 0], | [True, True, True, False], - [b6, 0, 0, 0] | [True, False, False, False], - ] | ] - - Assumes that the inputy has the following dimension order: - [time, number of envs, aditional dimensions] - """ - dones = dones.clone() - dones[-1] = 1 - # * Permute the buffers to have order - # * (num_envs, num_transitions_per_env, ...), for correct reshaping - flat_dones = dones.transpose(1, 0).reshape(-1, 1) - - # * Get length of trajectory by counting the number of - # * successive not done elements - done_indices = torch.cat( - ( - flat_dones.new_tensor([-1], dtype=torch.int64), - flat_dones.nonzero()[:, 0], - ) - ) - trajectory_lengths = done_indices[1:] - done_indices[:-1] - trajectory_lengths_list = trajectory_lengths.tolist() - # * Extract the individual trajectories - trajectories = torch.split( - tensor.transpose(1, 0).flatten(0, 1), trajectory_lengths_list - ) - padded_trajectories = torch.nn.utils.rnn.pad_sequence(trajectories) - - trajectory_masks = trajectory_lengths > torch.arange( - 0, tensor.shape[0], device=tensor.device - ).unsqueeze(1) - return padded_trajectories, trajectory_masks - - -def unpad_trajectories(trajectories, masks): - """Does the inverse operation of split_and_pad_trajectories()""" - # * Need to transpose before and after the masking to have proper reshaping - return ( - trajectories.transpose(1, 0)[masks.transpose(1, 0)] - .view(-1, trajectories.shape[0], trajectories.shape[-1]) - .transpose(1, 0) - ) - - def remove_zero_weighted_rewards(reward_weights): for name in list(reward_weights.keys()): if reward_weights[name] == 0: @@ -110,3 +17,9 @@ def set_discount_from_horizon(dt, horizon): discount_factor = 1 - 1 / discrete_time_horizon return discount_factor + + +def polyak_update(online, target, polyak_factor): + for op, tp in zip(online.parameters(), target.parameters()): + tp.data.copy_((1.0 - polyak_factor) * op.data + polyak_factor * tp.data) + return target diff --git a/pyproject.toml b/pyproject.toml index a1906be6..131f359f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,8 +40,6 @@ target-version = "py38" # McCabe complexity (`C901`) by default. ignore = ["E722"] -ignore-init-module-imports = true - # Allow unused variables when underscore-prefixed. 
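A sketch of how the polyak_update helper added above would be used for the SAC target critics; the module and the factor are illustrative.

import torch.nn as nn
from learning.utils import polyak_update

critic = nn.Linear(8, 1)
target_critic = nn.Linear(8, 1)
target_critic.load_state_dict(critic.state_dict())  # targets start as copies

# each call keeps 99.5% of the target weights and blends in 0.5% of the online net
polyak_update(critic, target_critic, polyak_factor=0.995)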
dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
diff --git a/requirements.txt b/requirements.txt index e2a6fe2f..923df0e8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,6 @@ mss ruff pre-commit onnx -onnxruntime \ No newline at end of file +onnxruntime +tensordict +optuna \ No newline at end of file
diff --git a/resources/robots/mit_humanoid/friction_model_L.pt b/resources/robots/mit_humanoid/friction_model_L.pt new file mode 100644 index 00000000..d1d680d5 Binary files /dev/null and b/resources/robots/mit_humanoid/friction_model_L.pt differ
diff --git a/resources/robots/mit_humanoid/friction_model_R.pt b/resources/robots/mit_humanoid/friction_model_R.pt new file mode 100644 index 00000000..ce831fc8 Binary files /dev/null and b/resources/robots/mit_humanoid/friction_model_R.pt differ
diff --git a/resources/robots/mit_humanoid/urdf/humanoid_F_sf.urdf b/resources/robots/mit_humanoid/urdf/humanoid_F_sf.urdf index 51a84eee..f7f55186 100644 --- a/resources/robots/mit_humanoid/urdf/humanoid_F_sf.urdf +++ b/resources/robots/mit_humanoid/urdf/humanoid_F_sf.urdf
[URDF XML stripped in extraction: eight hunks (lines 562, 607, 660, 711, 790, 835, 888, 939, "Simple Foot: foot approximated as single box-contact") each change a joint to type="fixed"]
diff --git a/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf b/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf new file mode 100644 index 00000000..98a0773a --- /dev/null +++ b/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf @@ -0,0 +1,996 @@
[996 added lines of URDF XML stripped in extraction]
diff --git a/resources/robots/pendulum/urdf/pendulum.urdf b/resources/robots/pendulum/urdf/pendulum.urdf new file mode 100644 index 00000000..90e894e4 --- /dev/null +++ b/resources/robots/pendulum/urdf/pendulum.urdf @@ -0,0 +1,56 @@
[56 added lines of URDF XML stripped in extraction]
diff --git a/scripts/play.py b/scripts/play.py index 29a837ea..37882f7e 100644 --- a/scripts/play.py +++ b/scripts/play.py @@ -9,16 +9,16 @@ def setup(args): env_cfg, train_cfg = task_registry.create_cfgs(args) - env_cfg.env.num_envs = min(env_cfg.env.num_envs, 16) + env_cfg.env.num_envs = 32 if hasattr(env_cfg, "push_robots"): env_cfg.push_robots.toggle = False if hasattr(env_cfg, "commands"): env_cfg.commands.resampling_time = 9999 - env_cfg.env.episode_length_s = 9999 + env_cfg.env.episode_length_s = 50 env_cfg.env.num_projectiles = 20 task_registry.make_gym_and_sim() + env_cfg.init_state.reset_mode = "reset_to_range" env = task_registry.make_env(args.task, env_cfg) - env.cfg.init_state.reset_mode = "reset_to_basic" train_cfg.runner.resume = True train_cfg.logging.enable_local_saving = False runner = task_registry.make_alg_runner(env, train_cfg) @@ -46,9 +46,9 @@ def play(env, runner, train_cfg): if env.cfg.viewer.record: recorder.update(i) runner.set_actions( - runner.policy_cfg["actions"], + runner.actor_cfg["actions"], runner.get_inference_actions(), - runner.policy_cfg["disable_actions"], + runner.actor_cfg["disable_actions"], ) env.step() env.check_exit()
diff --git a/scripts/train.py b/scripts/train.py index dfa6afea..759992cf 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -13,7 +13,6 @@ def setup(): wandb_helper.setup_wandb(env_cfg=env_cfg, train_cfg=train_cfg, args=args) env = task_registry.make_env(name=args.task, env_cfg=env_cfg) randomize_episode_counters(env) - policy_runner = task_registry.make_alg_runner(env, train_cfg) local_code_save_helper.save_local_files_to_logs(train_cfg.log_dir)
diff --git a/setup.py b/setup.py index c918d911..06709c4e 100644 --- a/setup.py +++ b/setup.py @@ -2,20 +2,16 @@ from distutils.core import setup setup( - name="gpuGym", - version="1.0.1", + name="QGym", + version="1.0.3", author="Biomimetic Robotics Lab", license="BSD-3-Clause", packages=find_packages(), - description="Isaac Gym environments for Legged Robots", + description="Isaac Gym environments", install_requires=[ "isaacgym", - "matplotlib", - "pandas", - "tensorboard", "setuptools==59.5.0", "torch>=1.4.0", "torchvision>=0.5.0", - "numpy>=1.16.4", ], )
diff --git a/tests/integration_tests/test_runner_integration.py b/tests/integration_tests/test_runner_integration.py index 4781a665..ce645b5c 100644 --- a/tests/integration_tests/test_runner_integration.py +++ b/tests/integration_tests/test_runner_integration.py @@ -7,7 +7,7 @@ # * imports for onnx test import onnx import onnxruntime -from learning.modules import ActorCritic +from learning.modules import Actor from gym.utils.helpers import get_load_path from gym import LEGGED_GYM_ROOT_DIR @@ -27,21 +27,18 @@ def learn_policy(args): def load_saved_policy(runner): - num_actor_obs = runner.get_obs_size(runner.policy_cfg["actor_obs"]) - num_critic_obs = runner.get_obs_size(runner.policy_cfg["critic_obs"]) - num_actions = runner.get_action_size(runner.policy_cfg["actions"]) - actor_critic = ActorCritic( - num_actor_obs, num_critic_obs, num_actions, **runner.policy_cfg - ).to(runner.device) + num_actor_obs = runner.get_obs_size(runner.actor_cfg["obs"]) + num_actions = runner.get_action_size(runner.actor_cfg["actions"]) + actor = Actor(num_actor_obs, num_actions, **runner.actor_cfg).to(runner.device) resume_path = get_load_path( name=runner.cfg["experiment_name"], load_run=runner.cfg["load_run"], checkpoint=runner.cfg["checkpoint"], ) - loaded_dict = torch.load(resume_path) -
actor_critic.actor.load_state_dict(loaded_dict["actor_state_dict"]) - actor_critic.eval() - return actor_critic + loaded_dict = torch.load(resume_path, weights_only=True) + actor.load_state_dict(loaded_dict["actor_state_dict"]) + actor.eval() + return actor class TestDefaultIntegration: @@ -60,7 +57,7 @@ def test_default_integration_settings(self, args): with torch.no_grad(): actions = runner.get_inference_actions() - deployed_actions = runner.env.get_states(runner.policy_cfg["actions"]) + deployed_actions = runner.env.get_states(runner.actor_cfg["actions"]) assert ( torch.equal(actions, torch.zeros_like(actions)) is False ), "Policy returning all zeros" @@ -95,20 +92,20 @@ def test_default_integration_settings(self, args): f"{model_5_path}" "(last iteration) was not saved" ) - obs = torch.randn_like(runner.get_obs(runner.policy_cfg["actor_obs"])) - actions_first = runner.alg.actor_critic.act_inference(obs).cpu().clone() + obs = torch.randn_like(runner.get_obs(runner.actor_cfg["obs"])) + actions_first = runner.alg.actor.act_inference(obs).cpu().clone() runner.load(model_8_path) - actions_loaded = runner.alg.actor_critic.act_inference(obs).cpu().clone() + actions_loaded = runner.alg.actor.act_inference(obs).cpu().clone() assert torch.equal(actions_first, actions_loaded), "Model loading failed" # * test onnx exporting - loaded_actor_critic = load_saved_policy(runner) + loaded_actor = load_saved_policy(runner) export_path = os.path.join( LEGGED_GYM_ROOT_DIR, "tests", "integration_tests", "exported_policy" ) - loaded_actor_critic.export_policy(export_path) + loaded_actor.export(export_path) # load the exported onnx path_to_onnx = os.path.join(export_path, "policy.onnx") @@ -121,9 +118,9 @@ def test_default_integration_settings(self, args): ) # compute torch output with torch.no_grad(): - test_input = runner.get_obs(runner.policy_cfg["actor_obs"])[0:1] - runner_out = runner.alg.actor_critic.actor.act_inference(test_input) - loaded_out = loaded_actor_critic.actor.act_inference(test_input) + test_input = runner.get_obs(runner.actor_cfg["obs"])[0] + runner_out = runner.alg.actor.act_inference(test_input) + loaded_out = loaded_actor.act_inference(test_input) # compute ONNX Runtime output prediction ort_inputs = {ort_session.get_inputs()[0].name: test_input.cpu().numpy()} @@ -131,8 +128,8 @@ def test_default_integration_settings(self, args): # compare ONNX Runtime and PyTorch results torch.testing.assert_close( - runner_out.cpu().numpy(), loaded_out.cpu().numpy(), rtol=1e-05, atol=1e-8 + runner_out.cpu().numpy(), loaded_out.cpu().numpy(), rtol=1e-05, atol=1e-7 ) torch.testing.assert_close( - runner_out.cpu().numpy(), ort_out[0], rtol=1e-05, atol=1e-8 + runner_out.cpu().numpy(), ort_out[0], rtol=1e-05, atol=1e-7 ) diff --git a/tests/regression_tests/run_n_iterations.py b/tests/regression_tests/run_n_iterations.py index 6a3a4965..29e2f424 100644 --- a/tests/regression_tests/run_n_iterations.py +++ b/tests/regression_tests/run_n_iterations.py @@ -33,7 +33,7 @@ def _run_training_test(output_tensor_file, env, runner): runner.learn() # * get the test values after learning - actions = runner.env.get_states(runner.policy_cfg["actions"]) + actions = runner.env.get_states(runner.actor_cfg["actions"]) # * return the values to the parent for assertion actions.detach().cpu() diff --git a/tests/regression_tests/test_generated_torch_files.py b/tests/regression_tests/test_generated_torch_files.py index ef8ddc15..11b9ff5a 100644 --- a/tests/regression_tests/test_generated_torch_files.py +++ 
b/tests/regression_tests/test_generated_torch_files.py @@ -2,8 +2,8 @@ def compare_tensors(file1, file2): - tensor1 = torch.load(file1) - tensor2 = torch.load(file2) + tensor1 = torch.load(file1, weights_only=True) + tensor2 = torch.load(file2, weights_only=True) assert torch.all( torch.eq(tensor1, tensor2) ), f"Tensors in {file1} and {file2} are not equal."
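The tests above (and the runner load() methods) now pass weights_only=True to torch.load, which works because the checkpoints contain only tensors, state dicts, and ints. A minimal round-trip sketch with an illustrative file name:

import torch
import torch.nn as nn

actor = nn.Linear(4, 2)
torch.save({"actor_state_dict": actor.state_dict(), "iter": 10}, "model_10.pt")

loaded = torch.load("model_10.pt", weights_only=True)  # restricted unpickler
actor.load_state_dict(loaded["actor_state_dict"])
print(loaded["iter"])  # 10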