From bb85463c8c8cbd981673262a396f615ee38ba3fa Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 28 Feb 2024 13:30:12 -0500 Subject: [PATCH 01/42] smooth noise sampling and started gSDE --- .gitignore | 2 + .../mini_cheetah/mini_cheetah_ref_config.py | 2 +- learning/algorithms/ppo.py | 2 +- learning/modules/actor.py | 19 ++ learning/modules/actor_critic.py | 15 ++ learning/modules/utils/__init__.py | 1 + learning/modules/utils/gSDE.py | 191 ++++++++++++++++++ learning/modules/utils/neural_net.py | 9 +- learning/runners/on_policy_runner.py | 4 + plots/plot.py | 19 ++ 10 files changed, 260 insertions(+), 4 deletions(-) create mode 100644 learning/modules/utils/gSDE.py create mode 100644 plots/plot.py diff --git a/.gitignore b/.gitignore index 0b68375a..d2761927 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,8 @@ gym/wandb *.npz user/wandb_config.json *trajectories/ +plots/*.csv +plots/*.png # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 41bb634f..4db376ce 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -67,7 +67,7 @@ class scaling(MiniCheetahCfg.scaling): class MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg): seed = -1 - runner_class_name = "MyRunner" + runner_class_name = "OnPolicyRunner" class policy(MiniCheetahRunnerCfg.policy): actor_hidden_dims = [256, 256, 128] diff --git a/learning/algorithms/ppo.py b/learning/algorithms/ppo.py index 9c7454bd..3ca5a5b4 100644 --- a/learning/algorithms/ppo.py +++ b/learning/algorithms/ppo.py @@ -152,7 +152,7 @@ def update(self): old_mu_batch, old_sigma_batch, ) in generator: - self.actor_critic.act(obs_batch) + self.actor_critic.actor.update_distribution(obs_batch) actions_log_prob_batch = self.actor_critic.get_actions_log_prob( actions_batch ) diff --git a/learning/modules/actor.py b/learning/modules/actor.py index f9c82396..e588ea5e 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -37,6 +37,9 @@ def __init__( self.distribution = None # disable args validation for speedup Normal.set_default_validate_args = False + # smooth exploration + self.use_smooth_expl = True + self.episode_noise = None @property def action_mean(self): @@ -58,8 +61,24 @@ def update_distribution(self, observations): def act(self, observations): self.update_distribution(observations) + if self.use_smooth_expl: + return self.act_smooth() return self.distribution.sample() + def act_smooth(self): + mean = self.distribution.mean + if self.episode_noise is None: + sample = self.distribution.sample() + self.episode_noise = sample - self.distribution.mean + else: + sample = mean + self.episode_noise + # write to csv + with open( + "/home/lmolnar/workspace/QGym/plots/distribution_smooth.csv", "a" + ) as f: + f.write(str(mean[0][2].item()) + ", " + str(sample[0][2].item()) + "\n") + return sample + def get_actions_log_prob(self, actions): return self.distribution.log_prob(actions).sum(dim=-1) diff --git a/learning/modules/actor_critic.py b/learning/modules/actor_critic.py index ed837f5e..91365fcc 100644 --- a/learning/modules/actor_critic.py +++ b/learning/modules/actor_critic.py @@ -2,6 +2,7 @@ from .actor import Actor from .critic import Critic +from .utils import StateDependentNoiseDistribution class ActorCritic(nn.Module): @@ -15,6 +16,7 @@ def __init__( activation="elu", init_noise_std=1.0, normalize_obs=True, + use_sde=True, **kwargs, ): if kwargs: @@ -40,6 
+42,19 @@ def __init__( print(f"Actor MLP: {self.actor.NN}") print(f"Critic MLP: {self.critic.NN}") + # TODO[lm]: Decide how to handle the state dependent noise distribution in + # this class. In stable-baselines3 there is a class MlpExtractor which does + # what the Actor and Critic classes do here, just with the latent representation + # of the networks. Either I make a new class in a similar way and store the + # action distribution here, or I make the changes in Actor and Critic and change + # the distribution there. + if use_sde: + self.action_dist = StateDependentNoiseDistribution( + num_actions, + num_actor_obs, + num_critic_obs, + ) + @property def action_mean(self): return self.actor.action_mean diff --git a/learning/modules/utils/__init__.py b/learning/modules/utils/__init__.py index 1422b82d..3bc82600 100644 --- a/learning/modules/utils/__init__.py +++ b/learning/modules/utils/__init__.py @@ -1,2 +1,3 @@ from .neural_net import create_MLP, export_network from .normalize import RunningMeanStd +from .gSDE import StateDependentNoiseDistribution diff --git a/learning/modules/utils/gSDE.py b/learning/modules/utils/gSDE.py new file mode 100644 index 00000000..2920b796 --- /dev/null +++ b/learning/modules/utils/gSDE.py @@ -0,0 +1,191 @@ +from typing import Tuple, Optional + +import torch +import torch.nn as nn +from torch.distributions import Normal + + +# The follow implementation was used in the gSDE paper on smooth exploration. See code: +# https://github.com/DLR-RM/stable-baselines3/blob/56f20e40a2206bbb16501a0f600e29ce1b112ef1/stable_baselines3/common/distributions.py#L421C7-L421C38 +# TODO[lm]: Need to update some of the naming conventions to fit our codebase. +class StateDependentNoiseDistribution: + """ + Distribution class for using generalized State Dependent Exploration (gSDE). + It is used to create the noise exploration matrix and compute the log + probability of an action with that noise. + """ + + latent_sde_dim: Optional[int] + weights_dist: Normal + _latent_sde: torch.Tensor + exploration_mat: torch.Tensor + exploration_matrices: torch.Tensor + + def __init__( + self, + num_actions: int, + num_actor_obs: int, + num_critic_obs: int, + full_std: bool = True, + use_expln: bool = False, + learn_features: bool = False, + epsilon: float = 1e-6, + ): + self.num_actions = num_actions + self.num_actor_obs = num_actor_obs + self.num_critic_obs = num_critic_obs + self.full_std = full_std + self.use_expln = use_expln + self.learn_features = learn_features + self.epsilon = epsilon + self.mean_actions = None + self.log_std = None + + def get_std(self, log_std: torch.Tensor) -> torch.Tensor: + """ + Get the standard deviation from the learned parameter + (log of it by default). This ensures that the std is positive. 
+ + :param log_std: + :return: + """ + if self.use_expln: + # From gSDE paper, it allows to keep variance + # above zero and prevent it from growing too fast + below_threshold = torch.exp(log_std) * (log_std <= 0) + # Avoid NaN: zeros values that are below zero + safe_log_std = log_std * (log_std > 0) + self.epsilon + above_threshold = (torch.log1p(safe_log_std) + 1.0) * (log_std > 0) + std = below_threshold + above_threshold + else: + # Use normal exponential + std = torch.exp(log_std) + + if self.full_std: + return std + assert self.latent_sde_dim is not None + # Reduce the number of parameters: + return torch.ones(self.latent_sde_dim, self.action_dim).to(log_std.device) * std + + def sample_weights(self, log_std: torch.Tensor, batch_size: int = 1) -> None: + """ + Sample weights for the noise exploration matrix, + using a centered Gaussian distribution. + + :param log_std: + :param batch_size: + """ + std = self.get_std(log_std) + self.weights_dist = Normal(torch.zeros_like(std), std) + # Reparametrization trick to pass gradients + self.exploration_mat = self.weights_dist.rsample() + # Pre-compute matrices in case of parallel exploration + self.exploration_matrices = self.weights_dist.rsample((batch_size,)) + + def proba_distribution_net( + self, + latent_dim: int, + log_std_init: float = -2.0, + latent_sde_dim: Optional[int] = None, + ) -> Tuple[nn.Module, nn.Parameter]: + """ + Create the layers and parameter that represent the distribution: + one output will be the deterministic action, the other parameter will be the + standard deviation of the distribution that control the weights of the noise + matrix. + + :param latent_dim: Dimension of the last layer of the policy (before the + action layer) + :param log_std_init: Initial value for the log standard deviation + :param latent_sde_dim: Dimension of the last layer of the features extractor + for gSDE. By default, it is shared with the policy network. 
+ :return: + """ + # Network for the deterministic action, it represents the mean of the + # distribution + mean_actions_net = nn.Linear(latent_dim, self.action_dim) + # When we learn features for the noise, the feature dimension + # can be different between the policy and the noise network + self.latent_sde_dim = latent_dim if latent_sde_dim is None else latent_sde_dim + # Reduce the number of parameters if needed + log_std = ( + torch.ones(self.latent_sde_dim, self.action_dim) + if self.full_std + else torch.ones(self.latent_sde_dim, 1) + ) + # Transform it to a parameter so it can be optimized + log_std = nn.Parameter(log_std * log_std_init, requires_grad=True) + # Sample an exploration matrix + self.sample_weights(log_std) + return mean_actions_net, log_std + + def proba_distribution( + self, + mean_actions: torch.Tensor, + log_std: torch.Tensor, + latent_sde: torch.Tensor, + ) -> "StateDependentNoiseDistribution": + """ + Create the distribution given its parameters (mean, std) + + :param mean_actions: + :param log_std: + :param latent_sde: + :return: + """ + # Stop gradient if we don't want to influence the features + self._latent_sde = latent_sde if self.learn_features else latent_sde.detach() + variance = torch.mm(self._latent_sde**2, self.get_std(log_std) ** 2) + self.distribution = Normal(mean_actions, torch.sqrt(variance + self.epsilon)) + return self + + def log_prob(self, actions: torch.Tensor): + # TODO[lm]: verify that summed correctly (stable-baselines3 is different) + return self.distribution.log_prob(actions).sum(-1) + + def entropy(self): + # TODO[lm]: verify that summed correctly (stable-baselines3 is different) + return self.distribution.entropy().sum(-1) + + def sample(self) -> torch.Tensor: + noise = self.get_noise(self._latent_sde) + actions = self.distribution.mean + noise + return actions + + def mode(self) -> torch.Tensor: + return self.distribution.mean + + def get_noise(self, latent_sde: torch.Tensor) -> torch.Tensor: + latent_sde = latent_sde if self.learn_features else latent_sde.detach() + # Default case: only one exploration matrix + if len(latent_sde) == 1 or len(latent_sde) != len(self.exploration_matrices): + return torch.mm(latent_sde, self.exploration_mat) + # Use batch matrix multiplication for efficient computation + # (batch_size, n_features) -> (batch_size, 1, n_features) + latent_sde = latent_sde.unsqueeze(dim=1) + # (batch_size, 1, n_actions) + noise = torch.bmm(latent_sde, self.exploration_matrices) + return noise.squeeze(dim=1) + + def actions_from_params( + self, + mean_actions: torch.Tensor, + log_std: torch.Tensor, + latent_sde: torch.Tensor, + deterministic: bool = False, + ) -> torch.Tensor: + # Update the proba distribution + self.proba_distribution(mean_actions, log_std, latent_sde) + if deterministic: + return self.mode() + return self.sample() + + def log_prob_from_params( + self, + mean_actions: torch.Tensor, + log_std: torch.Tensor, + latent_sde: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + actions = self.actions_from_params(mean_actions, log_std, latent_sde) + log_prob = self.log_prob(actions) + return actions, log_prob diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index 5fd6ab3b..fa25d77b 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -3,7 +3,9 @@ import copy -def create_MLP(num_inputs, num_outputs, hidden_dims, activation, dropouts=None): +def create_MLP( + num_inputs, num_outputs, hidden_dims, activation, dropouts=None, 
latent=False +): activation = get_activation(activation) if dropouts is None: @@ -15,7 +17,10 @@ def create_MLP(num_inputs, num_outputs, hidden_dims, activation, dropouts=None): else: add_layer(layers, num_inputs, hidden_dims[0], activation, dropouts[0]) for i in range(len(hidden_dims)): - if i == len(hidden_dims) - 1: + # TODO[lm]: Could also create a separate function that gives the latent + # representation used for smooth exploration (but if it doesn't mess up + # anything, this is simpler) + if i == len(hidden_dims) - 1 and not latent: add_layer(layers, hidden_dims[i], num_outputs) else: add_layer( diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index e2fb524e..9bf7c9f7 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -37,6 +37,10 @@ def learn(self): # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): + # Reset noise for smooth exploration + if self.alg.actor_critic.actor.use_smooth_expl and i == 0: + self.alg.actor_critic.actor.episode_noise = None + actions = self.alg.act(actor_obs, critic_obs) self.set_actions( self.policy_cfg["actions"], diff --git a/plots/plot.py b/plots/plot.py new file mode 100644 index 00000000..2a33eafa --- /dev/null +++ b/plots/plot.py @@ -0,0 +1,19 @@ +import pandas as pd +import matplotlib.pyplot as plt + +# Read the CSV file +name = "distribution_smooth" +data = pd.read_csv(name + ".csv") + +# Plot the data +n = 200 +plt.plot(data.iloc[:n, 0]) +plt.plot(data.iloc[:n, 1]) +plt.xlabel("timestep") +plt.ylabel("action") +plt.title("Smoothing every rollout") +plt.legend(["mean", "sample"]) +# plt.show() + +# Save the plot to a file +plt.savefig(name + ".png") From 243cf0f787b268d4b6f75e35b6524e3de911108b Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 1 Mar 2024 10:41:25 -0500 Subject: [PATCH 02/42] address comments --- gym/envs/base/legged_robot_config.py | 1 + gym/envs/mini_cheetah/mini_cheetah_config.py | 1 + .../mini_cheetah/mini_cheetah_osc_config.py | 1 + .../mini_cheetah/mini_cheetah_ref_config.py | 1 + gym/envs/mit_humanoid/mit_humanoid_config.py | 1 + learning/modules/actor.py | 44 +++++++++++-------- learning/modules/actor_critic.py | 32 +++++++++----- learning/modules/utils/gSDE.py | 29 +++--------- learning/runners/on_policy_runner.py | 2 +- 9 files changed, 58 insertions(+), 54 deletions(-) diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 0de83e0e..144f8323 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -240,6 +240,7 @@ class policy: # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" normalize_obs = True + smooth_exploration = False actor_obs = [ "observation_a", diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index de015b21..ef9896e0 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -130,6 +130,7 @@ class policy(LeggedRobotRunnerCfg.policy): critic_hidden_dims = [128, 64] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = False actor_obs = [ "base_lin_vel", diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py index a08d5179..45191a9f 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py @@ -166,6 +166,7 @@ class 
policy(MiniCheetahRunnerCfg.policy): critic_hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = False actor_obs = [ "base_ang_vel", diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 4db376ce..2f894bde 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -74,6 +74,7 @@ class policy(MiniCheetahRunnerCfg.policy): critic_hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = True actor_obs = [ "base_ang_vel", diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 3439e58c..25db6163 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -187,6 +187,7 @@ class policy(LeggedRobotRunnerCfg.policy): critic_hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = False actor_obs = [ "base_height", diff --git a/learning/modules/actor.py b/learning/modules/actor.py index e588ea5e..857d7326 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -5,6 +5,8 @@ from .utils import export_network from .utils import RunningMeanStd +from gym import LEGGED_GYM_ROOT_DIR + class Actor(nn.Module): def __init__( @@ -37,9 +39,6 @@ def __init__( self.distribution = None # disable args validation for speedup Normal.set_default_validate_args = False - # smooth exploration - self.use_smooth_expl = True - self.episode_noise = None @property def action_mean(self): @@ -61,24 +60,8 @@ def update_distribution(self, observations): def act(self, observations): self.update_distribution(observations) - if self.use_smooth_expl: - return self.act_smooth() return self.distribution.sample() - def act_smooth(self): - mean = self.distribution.mean - if self.episode_noise is None: - sample = self.distribution.sample() - self.episode_noise = sample - self.distribution.mean - else: - sample = mean + self.episode_noise - # write to csv - with open( - "/home/lmolnar/workspace/QGym/plots/distribution_smooth.csv", "a" - ) as f: - f.write(str(mean[0][2].item()) + ", " + str(sample[0][2].item()) + "\n") - return sample - def get_actions_log_prob(self, actions): return self.distribution.log_prob(actions).sum(dim=-1) @@ -97,3 +80,26 @@ def normalize(self, observation): def export(self, path): export_network(self, "policy", path, self.num_obs) + + +class SmoothActor(Actor): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # Noise for smooth exploration + self.episode_noise = None + self.debug = False + + def act(self, observations): + self.update_distribution(observations) + mean = self.distribution.mean + if self.episode_noise is None: + sample = self.distribution.sample() + self.episode_noise = sample - self.distribution.mean + else: + sample = mean + self.episode_noise + + if self.debug: + # write to csv (used for plotting) + with open(f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_smooth.csv", "a") as f: + f.write(str(mean[0][2].item()) + ", " + str(sample[0][2].item()) + "\n") + return sample diff --git a/learning/modules/actor_critic.py b/learning/modules/actor_critic.py index 91365fcc..669a9f6a 100644 --- a/learning/modules/actor_critic.py +++ b/learning/modules/actor_critic.py @@ -1,6 +1,6 @@ import torch.nn as nn -from .actor import 
Actor +from .actor import Actor, SmoothActor from .critic import Critic from .utils import StateDependentNoiseDistribution @@ -16,7 +16,7 @@ def __init__( activation="elu", init_noise_std=1.0, normalize_obs=True, - use_sde=True, + smooth_exploration=False, **kwargs, ): if kwargs: @@ -26,14 +26,24 @@ def __init__( ) super(ActorCritic, self).__init__() - self.actor = Actor( - num_actor_obs, - num_actions, - actor_hidden_dims, - activation, - init_noise_std, - normalize_obs, - ) + if smooth_exploration: + self.actor = SmoothActor( + num_actor_obs, + num_actions, + actor_hidden_dims, + activation, + init_noise_std, + normalize_obs, + ) + else: + self.actor = Actor( + num_actor_obs, + num_actions, + actor_hidden_dims, + activation, + init_noise_std, + normalize_obs, + ) self.critic = Critic( num_critic_obs, critic_hidden_dims, activation, normalize_obs @@ -48,7 +58,7 @@ def __init__( # of the networks. Either I make a new class in a similar way and store the # action distribution here, or I make the changes in Actor and Critic and change # the distribution there. - if use_sde: + if smooth_exploration: self.action_dist = StateDependentNoiseDistribution( num_actions, num_actor_obs, diff --git a/learning/modules/utils/gSDE.py b/learning/modules/utils/gSDE.py index 2920b796..6e04b0e9 100644 --- a/learning/modules/utils/gSDE.py +++ b/learning/modules/utils/gSDE.py @@ -5,7 +5,8 @@ from torch.distributions import Normal -# The follow implementation was used in the gSDE paper on smooth exploration. See code: +# The following implementation was used in the gSDE paper on smooth exploration. +# See code: # https://github.com/DLR-RM/stable-baselines3/blob/56f20e40a2206bbb16501a0f600e29ce1b112ef1/stable_baselines3/common/distributions.py#L421C7-L421C38 # TODO[lm]: Need to update some of the naming conventions to fit our codebase. class StateDependentNoiseDistribution: @@ -45,9 +46,6 @@ def get_std(self, log_std: torch.Tensor) -> torch.Tensor: """ Get the standard deviation from the learned parameter (log of it by default). This ensures that the std is positive. - - :param log_std: - :return: """ if self.use_expln: # From gSDE paper, it allows to keep variance @@ -71,9 +69,6 @@ def sample_weights(self, log_std: torch.Tensor, batch_size: int = 1) -> None: """ Sample weights for the noise exploration matrix, using a centered Gaussian distribution. - - :param log_std: - :param batch_size: """ std = self.get_std(log_std) self.weights_dist = Normal(torch.zeros_like(std), std) @@ -82,7 +77,7 @@ def sample_weights(self, log_std: torch.Tensor, batch_size: int = 1) -> None: # Pre-compute matrices in case of parallel exploration self.exploration_matrices = self.weights_dist.rsample((batch_size,)) - def proba_distribution_net( + def get_distribution_net( self, latent_dim: int, log_std_init: float = -2.0, @@ -93,13 +88,6 @@ def proba_distribution_net( one output will be the deterministic action, the other parameter will be the standard deviation of the distribution that control the weights of the noise matrix. - - :param latent_dim: Dimension of the last layer of the policy (before the - action layer) - :param log_std_init: Initial value for the log standard deviation - :param latent_sde_dim: Dimension of the last layer of the features extractor - for gSDE. By default, it is shared with the policy network. 
- :return: """ # Network for the deterministic action, it represents the mean of the # distribution @@ -119,7 +107,7 @@ def proba_distribution_net( self.sample_weights(log_std) return mean_actions_net, log_std - def proba_distribution( + def get_distribution( self, mean_actions: torch.Tensor, log_std: torch.Tensor, @@ -127,11 +115,6 @@ def proba_distribution( ) -> "StateDependentNoiseDistribution": """ Create the distribution given its parameters (mean, std) - - :param mean_actions: - :param log_std: - :param latent_sde: - :return: """ # Stop gradient if we don't want to influence the features self._latent_sde = latent_sde if self.learn_features else latent_sde.detach() @@ -174,8 +157,8 @@ def actions_from_params( latent_sde: torch.Tensor, deterministic: bool = False, ) -> torch.Tensor: - # Update the proba distribution - self.proba_distribution(mean_actions, log_std, latent_sde) + # Update the distribution + self.get_distribution(mean_actions, log_std, latent_sde) if deterministic: return self.mode() return self.sample() diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 9bf7c9f7..1cc3e0ef 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -38,7 +38,7 @@ def learn(self): with torch.inference_mode(): for i in range(self.num_steps_per_env): # Reset noise for smooth exploration - if self.alg.actor_critic.actor.use_smooth_expl and i == 0: + if self.policy_cfg["smooth_exploration"] and i == 0: self.alg.actor_critic.actor.episode_noise = None actions = self.alg.act(actor_obs, critic_obs) From d838dc6d140b648f84300efd4f92c9f714745e1c Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 1 Mar 2024 11:34:49 -0500 Subject: [PATCH 03/42] start moving things to SmoothActor --- learning/modules/actor.py | 22 +++++++++++++++++++++- learning/modules/actor_critic.py | 14 -------------- learning/modules/critic.py | 2 +- learning/modules/utils/gSDE.py | 2 -- learning/modules/utils/neural_net.py | 5 +++-- 5 files changed, 25 insertions(+), 20 deletions(-) diff --git a/learning/modules/actor.py b/learning/modules/actor.py index 857d7326..c8ffe255 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -4,6 +4,7 @@ from .utils import create_MLP from .utils import export_network from .utils import RunningMeanStd +from .utils import StateDependentNoiseDistribution from gym import LEGGED_GYM_ROOT_DIR @@ -32,7 +33,11 @@ def __init__( self.num_obs = num_obs self.num_actions = num_actions - self.NN = create_MLP(num_obs, num_actions, hidden_dims, activation) + self.hidden_dims = hidden_dims + self.activation = activation + self.NN = create_MLP( + num_obs, num_actions, hidden_dims, activation, latent=False + ) # Action noise self.std = nn.Parameter(init_noise_std * torch.ones(num_actions)) @@ -85,11 +90,26 @@ def export(self, path): class SmoothActor(Actor): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + # Create latent NN + self.NN = create_MLP( + self.num_obs, + self.num_actions, + self.hidden_dims, + self.activation, + latent=True, + ) + # State dependent action distribution + self.distribution = StateDependentNoiseDistribution( + self.num_actions, + self.num_obs, + ) # Noise for smooth exploration self.episode_noise = None + # Debug mode for plotting self.debug = False def act(self, observations): + # TODO[lm]: update distribution for gSDE self.update_distribution(observations) mean = self.distribution.mean if self.episode_noise is None: diff --git 
a/learning/modules/actor_critic.py b/learning/modules/actor_critic.py index 669a9f6a..770fdc71 100644 --- a/learning/modules/actor_critic.py +++ b/learning/modules/actor_critic.py @@ -2,7 +2,6 @@ from .actor import Actor, SmoothActor from .critic import Critic -from .utils import StateDependentNoiseDistribution class ActorCritic(nn.Module): @@ -52,19 +51,6 @@ def __init__( print(f"Actor MLP: {self.actor.NN}") print(f"Critic MLP: {self.critic.NN}") - # TODO[lm]: Decide how to handle the state dependent noise distribution in - # this class. In stable-baselines3 there is a class MlpExtractor which does - # what the Actor and Critic classes do here, just with the latent representation - # of the networks. Either I make a new class in a similar way and store the - # action distribution here, or I make the changes in Actor and Critic and change - # the distribution there. - if smooth_exploration: - self.action_dist = StateDependentNoiseDistribution( - num_actions, - num_actor_obs, - num_critic_obs, - ) - @property def action_mean(self): return self.actor.action_mean diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 47e0decf..430eca33 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -20,7 +20,7 @@ def __init__( ) super().__init__() - self.NN = create_MLP(num_obs, 1, hidden_dims, activation) + self.NN = create_MLP(num_obs, 1, hidden_dims, activation, latent=False) self._normalize_obs = normalize_obs if self._normalize_obs: self.obs_rms = RunningMeanStd(num_obs) diff --git a/learning/modules/utils/gSDE.py b/learning/modules/utils/gSDE.py index 6e04b0e9..cd100053 100644 --- a/learning/modules/utils/gSDE.py +++ b/learning/modules/utils/gSDE.py @@ -26,7 +26,6 @@ def __init__( self, num_actions: int, num_actor_obs: int, - num_critic_obs: int, full_std: bool = True, use_expln: bool = False, learn_features: bool = False, @@ -34,7 +33,6 @@ def __init__( ): self.num_actions = num_actions self.num_actor_obs = num_actor_obs - self.num_critic_obs = num_critic_obs self.full_std = full_std self.use_expln = use_expln self.learn_features = learn_features diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index fa25d77b..057e724f 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -20,8 +20,9 @@ def create_MLP( # TODO[lm]: Could also create a separate function that gives the latent # representation used for smooth exploration (but if it doesn't mess up # anything, this is simpler) - if i == len(hidden_dims) - 1 and not latent: - add_layer(layers, hidden_dims[i], num_outputs) + if i == len(hidden_dims) - 1: + if not latent: + add_layer(layers, hidden_dims[i], num_outputs) else: add_layer( layers, From aea5d88fb736087108cf37e66aa0ea8093638514 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 1 Mar 2024 16:25:05 -0500 Subject: [PATCH 04/42] moved everything to SmoothActor (it runs) --- learning/modules/actor.py | 41 ------- learning/modules/actor_critic.py | 3 +- learning/modules/smooth_actor.py | 130 ++++++++++++++++++++ learning/modules/utils/__init__.py | 1 - learning/modules/utils/gSDE.py | 172 --------------------------- learning/runners/on_policy_runner.py | 7 +- plots/plot.py | 10 +- 7 files changed, 140 insertions(+), 224 deletions(-) create mode 100644 learning/modules/smooth_actor.py delete mode 100644 learning/modules/utils/gSDE.py diff --git a/learning/modules/actor.py b/learning/modules/actor.py index c8ffe255..17cf0566 100644 --- a/learning/modules/actor.py +++ 
b/learning/modules/actor.py @@ -4,9 +4,6 @@ from .utils import create_MLP from .utils import export_network from .utils import RunningMeanStd -from .utils import StateDependentNoiseDistribution - -from gym import LEGGED_GYM_ROOT_DIR class Actor(nn.Module): @@ -85,41 +82,3 @@ def normalize(self, observation): def export(self, path): export_network(self, "policy", path, self.num_obs) - - -class SmoothActor(Actor): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # Create latent NN - self.NN = create_MLP( - self.num_obs, - self.num_actions, - self.hidden_dims, - self.activation, - latent=True, - ) - # State dependent action distribution - self.distribution = StateDependentNoiseDistribution( - self.num_actions, - self.num_obs, - ) - # Noise for smooth exploration - self.episode_noise = None - # Debug mode for plotting - self.debug = False - - def act(self, observations): - # TODO[lm]: update distribution for gSDE - self.update_distribution(observations) - mean = self.distribution.mean - if self.episode_noise is None: - sample = self.distribution.sample() - self.episode_noise = sample - self.distribution.mean - else: - sample = mean + self.episode_noise - - if self.debug: - # write to csv (used for plotting) - with open(f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_smooth.csv", "a") as f: - f.write(str(mean[0][2].item()) + ", " + str(sample[0][2].item()) + "\n") - return sample diff --git a/learning/modules/actor_critic.py b/learning/modules/actor_critic.py index 770fdc71..29f6cdf7 100644 --- a/learning/modules/actor_critic.py +++ b/learning/modules/actor_critic.py @@ -1,6 +1,7 @@ import torch.nn as nn -from .actor import Actor, SmoothActor +from .actor import Actor +from .smooth_actor import SmoothActor from .critic import Critic diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py new file mode 100644 index 00000000..6c6a42c7 --- /dev/null +++ b/learning/modules/smooth_actor.py @@ -0,0 +1,130 @@ +import torch +import torch.nn as nn +from torch.distributions import Normal + +from .actor import Actor +from .utils import create_MLP + +from gym import LEGGED_GYM_ROOT_DIR + + +# The following implementation is based on the gSDE paper. 
See code: +# https://github.com/DLR-RM/stable-baselines3/blob/56f20e40a2206bbb16501a0f600e29ce1b112ef1/stable_baselines3/common/distributions.py#L421C7-L421C38 +class SmoothActor(Actor): + weights_dist: Normal + _latent_sde: torch.Tensor + exploration_mat: torch.Tensor + exploration_matrices: torch.Tensor + + def __init__( + self, + *args, + full_std: bool = True, + use_exp_ln: bool = False, + learn_features: bool = False, + epsilon: float = 1e-6, + log_std_init: float = -2.0, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.full_std = full_std + self.use_expln = use_exp_ln + self.learn_features = learn_features + self.epsilon = epsilon + self.log_std_init = log_std_init + + # Create latent NN and last layer + self.latent_net = create_MLP( + self.num_obs, + self.num_actions, + self.hidden_dims, + self.activation, + latent=True, + ) + self.latent_dim = self.hidden_dims[-1] + self.mean_actions_net = nn.Linear(self.latent_dim, self.num_actions) + # Reduce the number of parameters if needed + if self.full_std: + log_std = torch.ones(self.latent_dim, self.num_actions) + else: + log_std = torch.ones(self.latent_dim, 1) + self.log_std = nn.Parameter(log_std * self.log_std_init, requires_grad=True) + # Sample an exploration matrix (this sets the exploration matrices) + self.sample_weights() + self.distribution = None + + # Debug mode for plotting + self.debug = False + + def sample_weights(self, batch_size=1): + # Sample weights for the noise exploration matrix + std = self.get_std() + self.weights_dist = Normal(torch.zeros_like(std), std) + # Reparametrization trick to pass gradients + self.exploration_mat = self.weights_dist.rsample() + # Pre-compute matrices in case of parallel exploration + self.exploration_matrices = self.weights_dist.rsample((batch_size,)) + + def get_std(self): + # TODO[lm]: Check if this is ok, and can use action_std in ActorCritic normally + if self.use_expln: + # From gSDE paper, it allows to keep variance + # above zero and prevent it from growing too fast + below_threshold = torch.exp(self.log_std) * (self.log_std <= 0) + # Avoid NaN: zeros values that are below zero + safe_log_std = self.log_std * (self.log_std > 0) + self.epsilon + above_threshold = (torch.log1p(safe_log_std) + 1.0) * (self.log_std > 0) + std = below_threshold + above_threshold + else: + # Use normal exponential + std = torch.exp(self.log_std) + + if self.full_std: + return std + assert self.latent_dim is not None + # Reduce the number of parameters: + return ( + torch.ones(self.latent_dim, self.num_actions).to(self.log_std.device) * std + ) + + def update_distribution(self, observations): + if self._normalize_obs: + observations = self.normalize(observations) + # Get latent features and compute distribution + self._latent_sde = self.latent_net(observations) + if not self.learn_features: + self._latent_sde = self._latent_sde.detach() + variance = torch.mm(self._latent_sde**2, self.get_std() ** 2) + mean_actions = self.mean_actions_net(self._latent_sde) + self.distribution = Normal(mean_actions, torch.sqrt(variance + self.epsilon)) + + def act(self, observations): + self.update_distribution(observations) + mean = self.distribution.mean + sample = mean + self.get_noise() + if self.debug: + # write to csv (used for plotting) + with open(f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_smooth.csv", "a") as f: + f.write(str(mean[0][2].item()) + ", " + str(sample[0][2].item()) + "\n") + return sample + + def act_inference(self, observations): + if self._normalize_obs: + observations = 
self.normalize(observations) + latent_sde = self.latent_net(observations) + mean_actions = self.mean_actions_net(latent_sde) + return mean_actions + + def get_noise(self): + latent_sde = self._latent_sde + if not self.learn_features: + latent_sde = latent_sde.detach() + # Default case: only one exploration matrix + if len(latent_sde) == 1 or len(latent_sde) != len(self.exploration_matrices): + return torch.mm(latent_sde, self.exploration_mat.to(latent_sde.device)) + # Use batch matrix multiplication for efficient computation + # (batch_size, n_features) -> (batch_size, 1, n_features) + latent_sde = latent_sde.unsqueeze(dim=1) + # (batch_size, 1, n_actions) + noise = torch.bmm(latent_sde, self.exploration_matrices) + return noise.squeeze(dim=1) diff --git a/learning/modules/utils/__init__.py b/learning/modules/utils/__init__.py index 3bc82600..1422b82d 100644 --- a/learning/modules/utils/__init__.py +++ b/learning/modules/utils/__init__.py @@ -1,3 +1,2 @@ from .neural_net import create_MLP, export_network from .normalize import RunningMeanStd -from .gSDE import StateDependentNoiseDistribution diff --git a/learning/modules/utils/gSDE.py b/learning/modules/utils/gSDE.py deleted file mode 100644 index cd100053..00000000 --- a/learning/modules/utils/gSDE.py +++ /dev/null @@ -1,172 +0,0 @@ -from typing import Tuple, Optional - -import torch -import torch.nn as nn -from torch.distributions import Normal - - -# The following implementation was used in the gSDE paper on smooth exploration. -# See code: -# https://github.com/DLR-RM/stable-baselines3/blob/56f20e40a2206bbb16501a0f600e29ce1b112ef1/stable_baselines3/common/distributions.py#L421C7-L421C38 -# TODO[lm]: Need to update some of the naming conventions to fit our codebase. -class StateDependentNoiseDistribution: - """ - Distribution class for using generalized State Dependent Exploration (gSDE). - It is used to create the noise exploration matrix and compute the log - probability of an action with that noise. - """ - - latent_sde_dim: Optional[int] - weights_dist: Normal - _latent_sde: torch.Tensor - exploration_mat: torch.Tensor - exploration_matrices: torch.Tensor - - def __init__( - self, - num_actions: int, - num_actor_obs: int, - full_std: bool = True, - use_expln: bool = False, - learn_features: bool = False, - epsilon: float = 1e-6, - ): - self.num_actions = num_actions - self.num_actor_obs = num_actor_obs - self.full_std = full_std - self.use_expln = use_expln - self.learn_features = learn_features - self.epsilon = epsilon - self.mean_actions = None - self.log_std = None - - def get_std(self, log_std: torch.Tensor) -> torch.Tensor: - """ - Get the standard deviation from the learned parameter - (log of it by default). This ensures that the std is positive. 
- """ - if self.use_expln: - # From gSDE paper, it allows to keep variance - # above zero and prevent it from growing too fast - below_threshold = torch.exp(log_std) * (log_std <= 0) - # Avoid NaN: zeros values that are below zero - safe_log_std = log_std * (log_std > 0) + self.epsilon - above_threshold = (torch.log1p(safe_log_std) + 1.0) * (log_std > 0) - std = below_threshold + above_threshold - else: - # Use normal exponential - std = torch.exp(log_std) - - if self.full_std: - return std - assert self.latent_sde_dim is not None - # Reduce the number of parameters: - return torch.ones(self.latent_sde_dim, self.action_dim).to(log_std.device) * std - - def sample_weights(self, log_std: torch.Tensor, batch_size: int = 1) -> None: - """ - Sample weights for the noise exploration matrix, - using a centered Gaussian distribution. - """ - std = self.get_std(log_std) - self.weights_dist = Normal(torch.zeros_like(std), std) - # Reparametrization trick to pass gradients - self.exploration_mat = self.weights_dist.rsample() - # Pre-compute matrices in case of parallel exploration - self.exploration_matrices = self.weights_dist.rsample((batch_size,)) - - def get_distribution_net( - self, - latent_dim: int, - log_std_init: float = -2.0, - latent_sde_dim: Optional[int] = None, - ) -> Tuple[nn.Module, nn.Parameter]: - """ - Create the layers and parameter that represent the distribution: - one output will be the deterministic action, the other parameter will be the - standard deviation of the distribution that control the weights of the noise - matrix. - """ - # Network for the deterministic action, it represents the mean of the - # distribution - mean_actions_net = nn.Linear(latent_dim, self.action_dim) - # When we learn features for the noise, the feature dimension - # can be different between the policy and the noise network - self.latent_sde_dim = latent_dim if latent_sde_dim is None else latent_sde_dim - # Reduce the number of parameters if needed - log_std = ( - torch.ones(self.latent_sde_dim, self.action_dim) - if self.full_std - else torch.ones(self.latent_sde_dim, 1) - ) - # Transform it to a parameter so it can be optimized - log_std = nn.Parameter(log_std * log_std_init, requires_grad=True) - # Sample an exploration matrix - self.sample_weights(log_std) - return mean_actions_net, log_std - - def get_distribution( - self, - mean_actions: torch.Tensor, - log_std: torch.Tensor, - latent_sde: torch.Tensor, - ) -> "StateDependentNoiseDistribution": - """ - Create the distribution given its parameters (mean, std) - """ - # Stop gradient if we don't want to influence the features - self._latent_sde = latent_sde if self.learn_features else latent_sde.detach() - variance = torch.mm(self._latent_sde**2, self.get_std(log_std) ** 2) - self.distribution = Normal(mean_actions, torch.sqrt(variance + self.epsilon)) - return self - - def log_prob(self, actions: torch.Tensor): - # TODO[lm]: verify that summed correctly (stable-baselines3 is different) - return self.distribution.log_prob(actions).sum(-1) - - def entropy(self): - # TODO[lm]: verify that summed correctly (stable-baselines3 is different) - return self.distribution.entropy().sum(-1) - - def sample(self) -> torch.Tensor: - noise = self.get_noise(self._latent_sde) - actions = self.distribution.mean + noise - return actions - - def mode(self) -> torch.Tensor: - return self.distribution.mean - - def get_noise(self, latent_sde: torch.Tensor) -> torch.Tensor: - latent_sde = latent_sde if self.learn_features else latent_sde.detach() - # Default case: only 
one exploration matrix - if len(latent_sde) == 1 or len(latent_sde) != len(self.exploration_matrices): - return torch.mm(latent_sde, self.exploration_mat) - # Use batch matrix multiplication for efficient computation - # (batch_size, n_features) -> (batch_size, 1, n_features) - latent_sde = latent_sde.unsqueeze(dim=1) - # (batch_size, 1, n_actions) - noise = torch.bmm(latent_sde, self.exploration_matrices) - return noise.squeeze(dim=1) - - def actions_from_params( - self, - mean_actions: torch.Tensor, - log_std: torch.Tensor, - latent_sde: torch.Tensor, - deterministic: bool = False, - ) -> torch.Tensor: - # Update the distribution - self.get_distribution(mean_actions, log_std, latent_sde) - if deterministic: - return self.mode() - return self.sample() - - def log_prob_from_params( - self, - mean_actions: torch.Tensor, - log_std: torch.Tensor, - latent_sde: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - actions = self.actions_from_params(mean_actions, log_std, latent_sde) - log_prob = self.log_prob(actions) - return actions, log_prob diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 1cc3e0ef..4f68e164 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -34,13 +34,12 @@ def learn(self): for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") logger.tic("collection") + # * Re-sample noise matrix for smooth exploration + if self.policy_cfg["smooth_exploration"]: + self.alg.actor_critic.actor.sample_weights() # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): - # Reset noise for smooth exploration - if self.policy_cfg["smooth_exploration"] and i == 0: - self.alg.actor_critic.actor.episode_noise = None - actions = self.alg.act(actor_obs, critic_obs) self.set_actions( self.policy_cfg["actions"], diff --git a/plots/plot.py b/plots/plot.py index 2a33eafa..adda11ce 100644 --- a/plots/plot.py +++ b/plots/plot.py @@ -5,13 +5,13 @@ name = "distribution_smooth" data = pd.read_csv(name + ".csv") -# Plot the data -n = 200 -plt.plot(data.iloc[:n, 0]) -plt.plot(data.iloc[:n, 1]) +# Plot the data (last n steps) +n = 500 +plt.plot(data.iloc[-n:, 0]) +plt.plot(data.iloc[-n:, 1]) plt.xlabel("timestep") plt.ylabel("action") -plt.title("Smoothing every rollout") +plt.title("gSDE") plt.legend(["mean", "sample"]) # plt.show() From 3603205cf8ab988e04af4e0b69412768b04b4607 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 1 Mar 2024 18:21:56 -0500 Subject: [PATCH 05/42] possibly resample in PPO update --- learning/algorithms/ppo.py | 6 +++++- learning/modules/__init__.py | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/learning/algorithms/ppo.py b/learning/algorithms/ppo.py index 3ca5a5b4..ad31f846 100644 --- a/learning/algorithms/ppo.py +++ b/learning/algorithms/ppo.py @@ -34,7 +34,7 @@ import torch.nn as nn import torch.optim as optim -from learning.modules import ActorCritic +from learning.modules import ActorCritic, SmoothActor from learning.storage import RolloutStorage @@ -152,6 +152,10 @@ def update(self): old_mu_batch, old_sigma_batch, ) in generator: + # TODO[lm]: Look into resampling noise here, gSDE paper seems to do it. 
+ if isinstance(self.actor_critic.actor, SmoothActor): + batch_size = obs_batch.shape[0] + self.actor_critic.actor.sample_weights(batch_size) self.actor_critic.actor.update_distribution(obs_batch) actions_log_prob_batch = self.actor_critic.get_actions_log_prob( actions_batch diff --git a/learning/modules/__init__.py b/learning/modules/__init__.py index 75cca5e2..86c96759 100644 --- a/learning/modules/__init__.py +++ b/learning/modules/__init__.py @@ -33,3 +33,4 @@ from .actor_critic import ActorCritic from .state_estimator import StateEstimatorNN from .actor import Actor +from .smooth_actor import SmoothActor From bb0a273982a5b90f3b97bbbfca4c8f7851fa5b63 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Mon, 4 Mar 2024 11:04:36 -0500 Subject: [PATCH 06/42] learn_features=True and correct sample dim --- learning/modules/smooth_actor.py | 4 ++-- learning/runners/on_policy_runner.py | 2 +- plots/plot.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py index 6c6a42c7..200f03f2 100644 --- a/learning/modules/smooth_actor.py +++ b/learning/modules/smooth_actor.py @@ -21,7 +21,7 @@ def __init__( *args, full_std: bool = True, use_exp_ln: bool = False, - learn_features: bool = False, + learn_features: bool = True, epsilon: float = 1e-6, log_std_init: float = -2.0, **kwargs, @@ -49,7 +49,7 @@ def __init__( else: log_std = torch.ones(self.latent_dim, 1) self.log_std = nn.Parameter(log_std * self.log_std_init, requires_grad=True) - # Sample an exploration matrix (this sets the exploration matrices) + # Sample an exploration matrix self.sample_weights() self.distribution = None diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 4f68e164..e183b816 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -36,7 +36,7 @@ def learn(self): logger.tic("collection") # * Re-sample noise matrix for smooth exploration if self.policy_cfg["smooth_exploration"]: - self.alg.actor_critic.actor.sample_weights() + self.alg.actor_critic.actor.sample_weights(batch_size=self.env.num_envs) # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): diff --git a/plots/plot.py b/plots/plot.py index adda11ce..5de8ba35 100644 --- a/plots/plot.py +++ b/plots/plot.py @@ -10,7 +10,7 @@ plt.plot(data.iloc[-n:, 0]) plt.plot(data.iloc[-n:, 1]) plt.xlabel("timestep") -plt.ylabel("action") +plt.ylabel("action (NN output)") plt.title("gSDE") plt.legend(["mean", "sample"]) # plt.show() From 40025c43de53627d9ca615c0aa40d9b88476755d Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Mon, 4 Mar 2024 12:01:29 -0500 Subject: [PATCH 07/42] adjust sample freq and update plotting --- gym/envs/mini_cheetah/mini_cheetah_ref_config.py | 1 + learning/modules/actor.py | 16 +++++++++++++++- learning/modules/smooth_actor.py | 7 +++---- learning/runners/on_policy_runner.py | 10 +++++++--- plots/plot.py | 4 ++-- 5 files changed, 28 insertions(+), 10 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 2f894bde..fda3f345 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -75,6 +75,7 @@ class policy(MiniCheetahRunnerCfg.policy): # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" smooth_exploration = True + exploration_sample_freq = 8 actor_obs = [ "base_ang_vel", diff --git a/learning/modules/actor.py 
b/learning/modules/actor.py index 17cf0566..25af26e6 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -5,6 +5,8 @@ from .utils import export_network from .utils import RunningMeanStd +from gym import LEGGED_GYM_ROOT_DIR + class Actor(nn.Module): def __init__( @@ -42,6 +44,9 @@ def __init__( # disable args validation for speedup Normal.set_default_validate_args = False + # Debug mode for plotting + self.debug = False + @property def action_mean(self): return self.distribution.mean @@ -62,7 +67,12 @@ def update_distribution(self, observations): def act(self, observations): self.update_distribution(observations) - return self.distribution.sample() + sample = self.distribution.sample() + if self.debug: + mean = self.distribution.mean + path = f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_baseline.csv" + self.log_actions(mean[0][2], sample[0][2], path) + return sample def get_actions_log_prob(self, actions): return self.distribution.log_prob(actions).sum(dim=-1) @@ -82,3 +92,7 @@ def normalize(self, observation): def export(self, path): export_network(self, "policy", path, self.num_obs) + + def log_actions(self, mean, sample, path): + with open(path, "a") as f: + f.write(str(mean.item()) + ", " + str(sample.item()) + "\n") diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py index 200f03f2..bc262798 100644 --- a/learning/modules/smooth_actor.py +++ b/learning/modules/smooth_actor.py @@ -54,7 +54,7 @@ def __init__( self.distribution = None # Debug mode for plotting - self.debug = False + self.debug = True def sample_weights(self, batch_size=1): # Sample weights for the noise exploration matrix @@ -103,9 +103,8 @@ def act(self, observations): mean = self.distribution.mean sample = mean + self.get_noise() if self.debug: - # write to csv (used for plotting) - with open(f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_smooth.csv", "a") as f: - f.write(str(mean[0][2].item()) + ", " + str(sample[0][2].item()) + "\n") + path = f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_smooth.csv" + self.log_actions(mean[0][2], sample[0][2], path) return sample def act_inference(self, observations): diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index e183b816..3b7ce33e 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -34,12 +34,16 @@ def learn(self): for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") logger.tic("collection") - # * Re-sample noise matrix for smooth exploration - if self.policy_cfg["smooth_exploration"]: - self.alg.actor_critic.actor.sample_weights(batch_size=self.env.num_envs) # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): + # * Re-sample noise matrix for smooth exploration + sample_freq = self.policy_cfg["exploration_sample_freq"] + if self.policy_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor_critic.actor.sample_weights( + batch_size=self.env.num_envs + ) + actions = self.alg.act(actor_obs, critic_obs) self.set_actions( self.policy_cfg["actions"], diff --git a/plots/plot.py b/plots/plot.py index 5de8ba35..5018cb4d 100644 --- a/plots/plot.py +++ b/plots/plot.py @@ -6,12 +6,12 @@ data = pd.read_csv(name + ".csv") # Plot the data (last n steps) -n = 500 +n = 200 plt.plot(data.iloc[-n:, 0]) plt.plot(data.iloc[-n:, 1]) plt.xlabel("timestep") plt.ylabel("action (NN output)") -plt.title("gSDE") +plt.title("gSDE (sample_freq=8)") plt.legend(["mean", "sample"]) # plt.show() From 
c3017461e5ba7629e02dc75a5448fc4bf81e57c4 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 15 Mar 2024 14:17:22 -0400 Subject: [PATCH 08/42] update log_std_init=0.0 and refactor --- learning/modules/smooth_actor.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py index bc262798..91754aeb 100644 --- a/learning/modules/smooth_actor.py +++ b/learning/modules/smooth_actor.py @@ -23,12 +23,12 @@ def __init__( use_exp_ln: bool = False, learn_features: bool = True, epsilon: float = 1e-6, - log_std_init: float = -2.0, + log_std_init: float = 0.0, **kwargs, ): super().__init__(*args, **kwargs) self.full_std = full_std - self.use_expln = use_exp_ln + self.use_exp_ln = use_exp_ln self.learn_features = learn_features self.epsilon = epsilon self.log_std_init = log_std_init @@ -54,7 +54,7 @@ def __init__( self.distribution = None # Debug mode for plotting - self.debug = True + self.debug = False def sample_weights(self, batch_size=1): # Sample weights for the noise exploration matrix @@ -67,7 +67,7 @@ def sample_weights(self, batch_size=1): def get_std(self): # TODO[lm]: Check if this is ok, and can use action_std in ActorCritic normally - if self.use_expln: + if self.use_exp_ln: # From gSDE paper, it allows to keep variance # above zero and prevent it from growing too fast below_threshold = torch.exp(self.log_std) * (self.log_std <= 0) @@ -83,9 +83,7 @@ def get_std(self): return std assert self.latent_dim is not None # Reduce the number of parameters: - return ( - torch.ones(self.latent_dim, self.num_actions).to(self.log_std.device) * std - ) + return torch.ones(self.latent_dim, 1).to(self.log_std.device) * std def update_distribution(self, observations): if self._normalize_obs: From a372a5274224c7b0bd18d6232452bb00cbec8778 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 3 Apr 2024 14:01:52 -0400 Subject: [PATCH 09/42] log joint data for training and play --- .gitignore | 6 +- gym/smooth_exploration/plot_play.py | 38 +++++++++ gym/smooth_exploration/plot_train.py | 55 +++++++++++++ learning/runners/on_policy_runner.py | 13 ++- plots/plot.py | 19 ----- scripts/log_play.py | 117 +++++++++++++++++++++++++++ scripts/log_train.py | 92 +++++++++++++++++++++ 7 files changed, 318 insertions(+), 22 deletions(-) create mode 100644 gym/smooth_exploration/plot_play.py create mode 100644 gym/smooth_exploration/plot_train.py delete mode 100644 plots/plot.py create mode 100644 scripts/log_play.py create mode 100644 scripts/log_train.py diff --git a/.gitignore b/.gitignore index d2761927..20093cee 100644 --- a/.gitignore +++ b/.gitignore @@ -16,8 +16,6 @@ gym/wandb *.npz user/wandb_config.json *trajectories/ -plots/*.csv -plots/*.png # Byte-compiled / optimized / DLL files __pycache__/ @@ -83,3 +81,7 @@ ipython_config.py venv/ env.bak/ venv.bak/ + +# Smooth exploration +gym/smooth_exploration/data +gym/smooth_exploration/figures diff --git a/gym/smooth_exploration/plot_play.py b/gym/smooth_exploration/plot_play.py new file mode 100644 index 00000000..eedee17f --- /dev/null +++ b/gym/smooth_exploration/plot_play.py @@ -0,0 +1,38 @@ +import numpy as np +import matplotlib.pyplot as plt + +name = "mini_cheetah_ref" +data_dir = "./data/play/" + name +fig_dir = "./figures/play" + +# load data +dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0] +dof_pos_target = np.load(data_dir + "/dof_pos_target.npy")[0] +dof_vel = np.load(data_dir + "/dof_vel.npy")[0] + +# plot data +n_steps = 200 +fig, axs = plt.subplots(3, 
figsize=(10, 10)) +plt.suptitle(name) + +for i in range(3): + axs[0].plot(dof_pos_obs[:n_steps, i]) +axs[0].set_title("dof_pos_obs") +axs[0].legend(["idx 0", "idx 1", "idx 2"]) +axs[0].set_xlabel("time steps") + +for i in range(3): + axs[1].plot(dof_pos_target[:n_steps, i]) +axs[1].set_title("dof_pos_target") +axs[1].legend(["idx 0", "idx 1", "idx 2"]) +axs[1].set_xlabel("time steps") + +for i in range(3): + axs[2].plot(dof_vel[:n_steps, i]) +axs[2].set_title("dof_vel") +axs[2].legend(["idx 0", "idx 1", "idx 2"]) +axs[2].set_xlabel("time steps") + +plt.tight_layout() +plt.savefig(fig_dir + "/" + name + ".png") +plt.show() diff --git a/gym/smooth_exploration/plot_train.py b/gym/smooth_exploration/plot_train.py new file mode 100644 index 00000000..8a45729f --- /dev/null +++ b/gym/smooth_exploration/plot_train.py @@ -0,0 +1,55 @@ +import numpy as np +import matplotlib.pyplot as plt + +SMOOTH = True +name = "ref_sample_8" +data_dir = "./data/train/" + name +fig_dir = "./figures/train" + +# load data +dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0] +dof_pos_target = np.load(data_dir + "/dof_pos_target.npy")[0] +dof_vel = np.load(data_dir + "/dof_vel.npy")[0] +torques = np.load(data_dir + "/torques.npy")[0] + +iterations = range(0, dof_pos_obs.shape[0], 10) + +# plot data for each iteration +for it in iterations: + fig, axs = plt.subplots(4, figsize=(10, 10)) + plt.suptitle(name + " iteration " + str(it)) + + for i in range(3): + axs[0].plot(dof_pos_obs[it, :, i]) + axs[0].set_title("dof_pos_obs") + axs[0].legend(["idx 0", "idx 1", "idx 2"]) + axs[0].set_xlabel("time steps") + + for i in range(3): + axs[1].plot(dof_pos_target[it, :, i]) + axs[1].set_title("dof_pos_target") + axs[1].legend(["idx 0", "idx 1", "idx 2"]) + axs[1].set_xlabel("time steps") + + for i in range(3): + axs[2].plot(dof_vel[it, :, i]) + axs[2].set_title("dof_vel") + axs[2].legend(["idx 0", "idx 1", "idx 2"]) + axs[2].set_xlabel("time steps") + + for i in range(3): + axs[3].plot(torques[it, :, i]) + axs[3].set_title("torques") + axs[3].legend(["idx 0", "idx 1", "idx 2"]) + axs[3].set_xlabel("time steps") + + if SMOOTH: + # plot vertical lines where noise is resampled + for x in range(0, dof_pos_obs.shape[1], 8): + axs[0].axvline(x, color="r", linestyle="--") + axs[1].axvline(x, color="r", linestyle="--") + axs[2].axvline(x, color="r", linestyle="--") + axs[3].axvline(x, color="r", linestyle="--") + + plt.tight_layout() + plt.savefig(fig_dir + "/" + name + "_it_" + str(it) + ".png") diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 9c167cd7..86541f9b 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -18,7 +18,7 @@ def __init__(self, env, train_cfg, device="cpu"): self.device, ) - def learn(self): + def learn(self, states_to_log_dict=None): self.set_up_logger() rewards_dict = {} @@ -53,6 +53,17 @@ def learn(self): self.env.step() + # * Log states + # This continuously overwrites the rollout data, so when the state + # dict is written to a file it contains the last rollout for each + # iteration. 
+ it_idx = self.it - 1 + if states_to_log_dict is not None: + for state in states_to_log_dict: + states_to_log_dict[state][0, it_idx, i, :] = getattr( + self.env, state + )[0, :] + actor_obs = self.get_noisy_obs( self.policy_cfg["actor_obs"], self.policy_cfg["noise"] ) diff --git a/plots/plot.py b/plots/plot.py deleted file mode 100644 index 5018cb4d..00000000 --- a/plots/plot.py +++ /dev/null @@ -1,19 +0,0 @@ -import pandas as pd -import matplotlib.pyplot as plt - -# Read the CSV file -name = "distribution_smooth" -data = pd.read_csv(name + ".csv") - -# Plot the data (last n steps) -n = 200 -plt.plot(data.iloc[-n:, 0]) -plt.plot(data.iloc[-n:, 1]) -plt.xlabel("timestep") -plt.ylabel("action (NN output)") -plt.title("gSDE (sample_freq=8)") -plt.legend(["mean", "sample"]) -# plt.show() - -# Save the plot to a file -plt.savefig(name + ".png") diff --git a/scripts/log_play.py b/scripts/log_play.py new file mode 100644 index 00000000..adc1b598 --- /dev/null +++ b/scripts/log_play.py @@ -0,0 +1,117 @@ +from gym.envs import __init__ # noqa: F401 +from gym.utils import get_args, task_registry +from gym.utils import KeyboardInterface +from gym.utils import VisualizationRecorder +from gym import LEGGED_GYM_ROOT_DIR + +# torch needs to be imported after isaacgym imports in local source +import torch +import os +import numpy as np + +TEST_TOTAL_TIMESTEPS = 1000 + + +def create_logging_dict(runner, test_total_timesteps): + states_to_log = [ + "dof_pos_target", + "dof_pos_obs", + "dof_vel", + "torques", + "commands", + # 'base_lin_vel', + # 'base_ang_vel', + # 'oscillators', + # 'grf', + # 'base_height' + ] + + states_to_log_dict = {} + + for state in states_to_log: + array_dim = runner.get_obs_size( + [ + state, + ] + ) + states_to_log_dict[state] = torch.zeros( + (1, test_total_timesteps, array_dim), device=runner.env.device + ) + return states_to_log_dict + + +def setup(args): + env_cfg, train_cfg = task_registry.create_cfgs(args) + env_cfg.env.num_envs = min(env_cfg.env.num_envs, 16) + if hasattr(env_cfg, "push_robots"): + env_cfg.push_robots.toggle = False + if hasattr(env_cfg, "commands"): + env_cfg.commands.resampling_time = 9999 + env_cfg.env.episode_length_s = 9999 + env_cfg.env.num_projectiles = 20 + task_registry.make_gym_and_sim() + env = task_registry.make_env(args.task, env_cfg) + env.cfg.init_state.reset_mode = "reset_to_basic" + train_cfg.runner.resume = True + train_cfg.logging.enable_local_saving = False + runner = task_registry.make_alg_runner(env, train_cfg) + + # * switch to evaluation mode (dropout for example) + runner.switch_to_eval() + return env, runner, train_cfg + + +def play(env, runner, train_cfg): + protocol_name = train_cfg.runner.experiment_name + + # * set up logging + log_file_path = os.path.join( + LEGGED_GYM_ROOT_DIR, "gym", "logs", "data", "play", protocol_name + ".npz" + ) + if not os.path.exists(os.path.dirname(log_file_path)): + os.makedirs(os.path.dirname(log_file_path)) + + states_to_log_dict = create_logging_dict(runner, TEST_TOTAL_TIMESTEPS) + + # * set up recording + if env.cfg.viewer.record: + recorder = VisualizationRecorder( + env, train_cfg.runner.experiment_name, train_cfg.runner.load_run + ) + + # * set up interface: GamepadInterface(env) or KeyboardInterface(env) + COMMANDS_INTERFACE = hasattr(env, "commands") + if COMMANDS_INTERFACE: + # interface = GamepadInterface(env) + interface = KeyboardInterface(env) + + for t in range(TEST_TOTAL_TIMESTEPS): + if COMMANDS_INTERFACE: + interface.update(env) + if env.cfg.viewer.record: + recorder.update(t) + 
runner.set_actions( + runner.policy_cfg["actions"], + runner.get_inference_actions(), + runner.policy_cfg["disable_actions"], + ) + env.step() + env.check_exit() + + # * log + for state in states_to_log_dict: + states_to_log_dict[state][:, t, :] = getattr(env, state)[0, :] + + # * save data + # first convert tensors to cpu + log_dict_cpu = {k: v.cpu() for k, v in states_to_log_dict.items()} + np.savez_compressed(log_file_path, **log_dict_cpu) + print("saved to ", log_file_path) + return states_to_log_dict + + +if __name__ == "__main__": + args = get_args() + with torch.no_grad(): + env, runner, train_cfg = setup(args) + play(env, runner, train_cfg) diff --git a/scripts/log_train.py b/scripts/log_train.py new file mode 100644 index 00000000..918d0b8f --- /dev/null +++ b/scripts/log_train.py @@ -0,0 +1,92 @@ +from gym.envs import __init__ # noqa: F401 +from gym.utils import get_args, task_registry, randomize_episode_counters +from gym.utils.logging_and_saving import wandb_singleton +from gym.utils.logging_and_saving import local_code_save_helper +from gym import LEGGED_GYM_ROOT_DIR + +# torch needs to be imported after isaacgym imports in local source +import torch +import os +import numpy as np + +TRAIN_ITERATIONS = 100 +ROLLOUT_TIMESTEPS = 32 + + +def create_logging_dict(runner): + states_to_log = [ + "dof_pos_target", + "dof_pos_obs", + "dof_vel", + "torques", + "commands", + # 'base_lin_vel', + # 'base_ang_vel', + # 'oscillators', + # 'grf', + # 'base_height' + ] + + states_to_log_dict = {} + + for state in states_to_log: + array_dim = runner.get_obs_size( + [ + state, + ] + ) + states_to_log_dict[state] = torch.zeros( + (1, TRAIN_ITERATIONS, ROLLOUT_TIMESTEPS, array_dim), + device=runner.env.device, + ) + return states_to_log_dict + + +def setup(): + args = get_args() + wandb_helper = wandb_singleton.WandbSingleton() + + env_cfg, train_cfg = task_registry.create_cfgs(args) + task_registry.make_gym_and_sim() + wandb_helper.setup_wandb(env_cfg=env_cfg, train_cfg=train_cfg, args=args) + env = task_registry.make_env(name=args.task, env_cfg=env_cfg) + randomize_episode_counters(env) + + train_cfg.runner.max_iterations = TRAIN_ITERATIONS + policy_runner = task_registry.make_alg_runner(env, train_cfg) + + local_code_save_helper.save_local_files_to_logs(train_cfg.log_dir) + + return train_cfg, policy_runner + + +def train(train_cfg, policy_runner): + protocol_name = train_cfg.runner.experiment_name + + # * set up logging + log_file_path = os.path.join( + LEGGED_GYM_ROOT_DIR, "gym", "logs", "data", "train", protocol_name + ".npz" + ) + if not os.path.exists(os.path.dirname(log_file_path)): + os.makedirs(os.path.dirname(log_file_path)) + + states_to_log_dict = create_logging_dict(policy_runner) + + # * train + # wandb_helper = wandb_singleton.WandbSingleton() + + policy_runner.learn(states_to_log_dict) + + # wandb_helper.close_wandb() + + # * save data + # first convert tensors to cpu + log_dict_cpu = {k: v.cpu() for k, v in states_to_log_dict.items()} + np.savez_compressed(log_file_path, **log_dict_cpu) + print("saved to ", log_file_path) + return states_to_log_dict + + +if __name__ == "__main__": + train_cfg, policy_runner = setup() + train(train_cfg=train_cfg, policy_runner=policy_runner) From 599b2cab114cbcbae72ead09ed682b4fa1b92ff7 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Sat, 6 Apr 2024 18:05:29 -0400 Subject: [PATCH 10/42] update logging and plotting --- .gitignore | 4 +- gym/smooth_exploration/plot_play.py | 4 +- gym/smooth_exploration/plot_train.py | 86 
++++++++++++++++++++-------- learning/runners/on_policy_runner.py | 57 ++++++++++++++---- scripts/log_play.py | 6 +- scripts/log_train.py | 13 ++++- 6 files changed, 128 insertions(+), 42 deletions(-) diff --git a/.gitignore b/.gitignore index 20093cee..c78e6696 100644 --- a/.gitignore +++ b/.gitignore @@ -83,5 +83,5 @@ env.bak/ venv.bak/ # Smooth exploration -gym/smooth_exploration/data -gym/smooth_exploration/figures +gym/smooth_exploration/data* +gym/smooth_exploration/figures* diff --git a/gym/smooth_exploration/plot_play.py b/gym/smooth_exploration/plot_play.py index eedee17f..5768f91a 100644 --- a/gym/smooth_exploration/plot_play.py +++ b/gym/smooth_exploration/plot_play.py @@ -2,8 +2,8 @@ import matplotlib.pyplot as plt name = "mini_cheetah_ref" -data_dir = "./data/play/" + name -fig_dir = "./figures/play" +data_dir = "./data_play/" + name +fig_dir = "./figures_play/" + name # load data dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0] diff --git a/gym/smooth_exploration/plot_train.py b/gym/smooth_exploration/plot_train.py index 8a45729f..58c50396 100644 --- a/gym/smooth_exploration/plot_train.py +++ b/gym/smooth_exploration/plot_train.py @@ -1,55 +1,93 @@ import numpy as np import matplotlib.pyplot as plt +import os +FOURIER = True SMOOTH = True -name = "ref_sample_8" -data_dir = "./data/train/" + name -fig_dir = "./figures/train" +SAMPLE_FREQ = 16 +STEPS = 1000 + +name = "ref_sample_16_len_1000" +data_dir = "./data_train/" + name +fig_dir = "./figures_train/" + name + +if not os.path.exists(fig_dir): + os.makedirs(fig_dir) # load data dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0] dof_pos_target = np.load(data_dir + "/dof_pos_target.npy")[0] dof_vel = np.load(data_dir + "/dof_vel.npy")[0] torques = np.load(data_dir + "/torques.npy")[0] +terminated = np.load(data_dir + "/terminated.npy")[0] + + +# plot fourier trainsform +def plot_fourier(data, it): + fig_ft, axs_ft = plt.subplots(2, figsize=(10, 10)) + for i in range(3): + ft = np.fft.fft(data[:, i]) + ft_half = ft[: len(ft) // 2] + axs_ft[0].plot(np.abs(ft_half)) + axs_ft[1].plot(np.angle(ft_half)) + + axs_ft[0].set_title("FT Amplitude") + axs_ft[0].set_xlabel("Frequency") + axs_ft[0].set_ylabel("Amplitude") + axs_ft[0].legend(["idx 0", "idx 1", "idx 2"]) + axs_ft[1].set_title("FT Phase") + axs_ft[1].set_xlabel("Frequency") + axs_ft[1].set_ylabel("Phase") + axs_ft[1].legend(["idx 0", "idx 1", "idx 2"]) + + fig_ft.savefig(fig_dir + "/dof_pos_target_FT_it_" + str(it) + ".png") -iterations = range(0, dof_pos_obs.shape[0], 10) # plot data for each iteration -for it in iterations: +for it in range(0, dof_pos_obs.shape[0], 10): + # check if iteration terminated + terminate_idx = np.where(terminated[it, :, 0] == 1)[0] + if terminate_idx.size > 0: + n_steps = terminate_idx[0] + else: + n_steps = dof_pos_obs.shape[1] + print(n_steps) + + # generate figure fig, axs = plt.subplots(4, figsize=(10, 10)) plt.suptitle(name + " iteration " + str(it)) + axs[0].set_title("dof_pos_obs") for i in range(3): - axs[0].plot(dof_pos_obs[it, :, i]) - axs[0].set_title("dof_pos_obs") - axs[0].legend(["idx 0", "idx 1", "idx 2"]) - axs[0].set_xlabel("time steps") + axs[0].plot(dof_pos_obs[it, :n_steps, i]) + axs[1].set_title("dof_pos_target") for i in range(3): - axs[1].plot(dof_pos_target[it, :, i]) - axs[1].set_title("dof_pos_target") - axs[1].legend(["idx 0", "idx 1", "idx 2"]) - axs[1].set_xlabel("time steps") + axs[1].plot(dof_pos_target[it, :n_steps, i]) + if FOURIER and n_steps == STEPS: + plot_fourier(dof_pos_target[it, :n_steps, :], 
it) + axs[2].set_title("dof_vel") for i in range(3): - axs[2].plot(dof_vel[it, :, i]) - axs[2].set_title("dof_vel") - axs[2].legend(["idx 0", "idx 1", "idx 2"]) - axs[2].set_xlabel("time steps") + axs[2].plot(dof_vel[it, :n_steps, i]) + axs[3].set_title("torques") for i in range(3): - axs[3].plot(torques[it, :, i]) - axs[3].set_title("torques") - axs[3].legend(["idx 0", "idx 1", "idx 2"]) - axs[3].set_xlabel("time steps") + axs[3].plot(torques[it, :n_steps, i]) + + # format plots + for idx in range(4): + axs[idx].legend(["idx 0", "idx 1", "idx 2"]) + axs[idx].set_xlabel("time steps") + axs[idx].set_xlim([0, n_steps]) if SMOOTH: # plot vertical lines where noise is resampled - for x in range(0, dof_pos_obs.shape[1], 8): + for x in range(0, dof_pos_obs.shape[1], SAMPLE_FREQ): axs[0].axvline(x, color="r", linestyle="--") axs[1].axvline(x, color="r", linestyle="--") axs[2].axvline(x, color="r", linestyle="--") axs[3].axvline(x, color="r", linestyle="--") - plt.tight_layout() - plt.savefig(fig_dir + "/" + name + "_it_" + str(it) + ".png") + fig.tight_layout() + fig.savefig(fig_dir + "/" + name + "_it_" + str(it) + ".png") diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 86541f9b..0729b994 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -34,6 +34,13 @@ def learn(self, states_to_log_dict=None): for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") logger.tic("collection") + + # * Simulate environment and log states + if states_to_log_dict is not None: + it_idx = self.it - 1 + if it_idx % 10 == 0: + self.sim_and_log_states(states_to_log_dict, it_idx) + # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): @@ -53,17 +60,6 @@ def learn(self, states_to_log_dict=None): self.env.step() - # * Log states - # This continuously overwrites the rollout data, so when the state - # dict is written to a file it contains the last rollout for each - # iteration. - it_idx = self.it - 1 - if states_to_log_dict is not None: - for state in states_to_log_dict: - states_to_log_dict[state][0, it_idx, i, :] = getattr( - self.env, state - )[0, :] - actor_obs = self.get_noisy_obs( self.policy_cfg["actor_obs"], self.policy_cfg["noise"] ) @@ -171,3 +167,42 @@ def init_storage(self): critic_obs_shape=[num_critic_obs], action_shape=[num_actions], ) + + def sim_and_log_states(self, states_to_log_dict, it_idx): + # Simulate environment for as many steps as expected in the dict. + # Log states to the dict, as well as whether the env terminated. 
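+        # Note: shapes follow the (1, iterations, steps, dim) logging tensors,
+        # and only the first environment (index 0) is recorded.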
+ steps = states_to_log_dict["terminated"].shape[2] + actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) + critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + + with torch.inference_mode(): + for i in range(steps): + sample_freq = self.policy_cfg["exploration_sample_freq"] + if self.policy_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor_critic.actor.sample_weights( + batch_size=self.env.num_envs + ) + + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.policy_cfg["actions"], + actions, + self.policy_cfg["disable_actions"], + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + ) + critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + + # Log states (just for the first env) + terminated = self.get_terminated()[0] + for state in states_to_log_dict: + if state == "terminated": + states_to_log_dict[state][0, it_idx, i, :] = terminated + else: + states_to_log_dict[state][0, it_idx, i, :] = getattr( + self.env, state + )[0, :] diff --git a/scripts/log_play.py b/scripts/log_play.py index adc1b598..f7fd68e8 100644 --- a/scripts/log_play.py +++ b/scripts/log_play.py @@ -66,7 +66,11 @@ def play(env, runner, train_cfg): # * set up logging log_file_path = os.path.join( - LEGGED_GYM_ROOT_DIR, "gym", "logs", "data", "play", protocol_name + ".npz" + LEGGED_GYM_ROOT_DIR, + "gym", + "smooth_exploration", + "data_play", + protocol_name + ".npz", ) if not os.path.exists(os.path.dirname(log_file_path)): os.makedirs(os.path.dirname(log_file_path)) diff --git a/scripts/log_train.py b/scripts/log_train.py index 918d0b8f..37fca0da 100644 --- a/scripts/log_train.py +++ b/scripts/log_train.py @@ -10,7 +10,7 @@ import numpy as np TRAIN_ITERATIONS = 100 -ROLLOUT_TIMESTEPS = 32 +ROLLOUT_TIMESTEPS = 1000 def create_logging_dict(runner): @@ -39,6 +39,11 @@ def create_logging_dict(runner): (1, TRAIN_ITERATIONS, ROLLOUT_TIMESTEPS, array_dim), device=runner.env.device, ) + + # log whether rollout terminated + states_to_log_dict["terminated"] = torch.zeros( + (1, TRAIN_ITERATIONS, ROLLOUT_TIMESTEPS, 1), device=runner.env.device + ) return states_to_log_dict @@ -65,7 +70,11 @@ def train(train_cfg, policy_runner): # * set up logging log_file_path = os.path.join( - LEGGED_GYM_ROOT_DIR, "gym", "logs", "data", "train", protocol_name + ".npz" + LEGGED_GYM_ROOT_DIR, + "gym", + "smooth_exploration", + "data_train", + protocol_name + ".npz", ) if not os.path.exists(os.path.dirname(log_file_path)): os.makedirs(os.path.dirname(log_file_path)) From 4994550b747b289011c91eddcfd40d4d2f59acff Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 19 Apr 2024 16:20:08 -0400 Subject: [PATCH 11/42] plot FT script --- gym/smooth_exploration/plot_ft.py | 56 +++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 gym/smooth_exploration/plot_ft.py diff --git a/gym/smooth_exploration/plot_ft.py b/gym/smooth_exploration/plot_ft.py new file mode 100644 index 00000000..5ad86b63 --- /dev/null +++ b/gym/smooth_exploration/plot_ft.py @@ -0,0 +1,56 @@ +import numpy as np +import matplotlib.pyplot as plt +import os + +SAMPLE_FREQ = 8 +STEPS = 500 + +smooth_name = "ref_sample_8_len_500" +baseline_name = "ref_baseline_len_500" + +smooth_data_dir = "./data_train/" + smooth_name +baseline_data_dir = "./data_train/" + baseline_name +fig_dir = "./figures_train/" + +if not os.path.exists(fig_dir): + os.makedirs(fig_dir) + +# load data +smooth_pos_target = np.load(smooth_data_dir + "/dof_pos_target.npy")[0] 
+baseline_pos_target = np.load(baseline_data_dir + "/dof_pos_target.npy")[0] +smooth_terminated = np.load(smooth_data_dir + "/terminated.npy")[0] +baseline_terminated = np.load(baseline_data_dir + "/terminated.npy")[0] + +# compute FFT averages +smooth_ffts = [[], [], []] +baseline_ffts = [[], [], []] +for it in range(0, smooth_pos_target.shape[0], 10): + # only use data that didn't terminate + if not np.any(smooth_terminated[it, :, 0]): + for idx in range(3): + fft = np.fft.fft(smooth_pos_target[it, :, idx]) + smooth_ffts[idx].append(fft[: len(fft) // 2]) + + if not np.any(baseline_terminated[it, :, 0]): + for idx in range(3): + fft = np.fft.fft(baseline_pos_target[it, :, idx]) + baseline_ffts[idx].append(fft[: len(fft) // 2]) + +print(f"Total smooth FFTS: {len(smooth_ffts[0])}") +print(f"Total baseline FFTS: {len(baseline_ffts[0])}") + +smooth_fft_means = [np.array(smooth_ffts[idx]).mean(axis=0) for idx in range(3)] +baseline_fft_means = [np.array(baseline_ffts[idx]).mean(axis=0) for idx in range(3)] + +# plot FFTs +fig, axs = plt.subplots(3, 1, figsize=(10, 10)) +for idx in range(3): + axs[idx].plot(np.abs(smooth_fft_means[idx])) + axs[idx].plot(np.abs(baseline_fft_means[idx])) + axs[idx].set_title(f"FT Amplitude idx {idx}") + axs[idx].set_xlabel("Frequency") + axs[idx].set_ylabel("Amplitude") + axs[idx].legend(["smooth", "baseline"]) + +fig.tight_layout() +fig.savefig(fig_dir + "/" + smooth_name + ".png") From 0205d99b33fc338c6ed3644ef55638fcc6bb1b5e Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 19 Apr 2024 18:47:36 -0400 Subject: [PATCH 12/42] added sweep config --- .../sweep_config_smooth_exploration.json | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 scripts/sweep_configs/sweep_config_smooth_exploration.json diff --git a/scripts/sweep_configs/sweep_config_smooth_exploration.json b/scripts/sweep_configs/sweep_config_smooth_exploration.json new file mode 100644 index 00000000..952282fe --- /dev/null +++ b/scripts/sweep_configs/sweep_config_smooth_exploration.json @@ -0,0 +1,19 @@ +{ + "method": "grid", + "metric": { + "name": "rewards/total_rewards", + "goal": "maximize" + }, + "run_cap": 10, + "parameters": { + "train_cfg.policy.exploration_sample_freq":{ + "values": [4, 8, 16, 32] + }, + "train_cfg.runner.num_steps_per_env":{ + "values": [32, 64] + }, + "train_cfg.runner.max_iterations":{ + "values": [800] + } + } +} \ No newline at end of file From 64b9b1446f4d63ca150db859d889b6d643a8b296 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 24 Apr 2024 13:56:26 -0400 Subject: [PATCH 13/42] update on policy and old policy runners, get nans for log_probs --- learning/algorithms/ppo2.py | 9 +++++++-- learning/modules/smooth_actor.py | 25 ++++++++++++++----------- learning/runners/BaseRunner.py | 7 +++++-- learning/runners/old_policy_runner.py | 18 ++++++++++++++++-- learning/runners/on_policy_runner.py | 12 +++++++----- 5 files changed, 49 insertions(+), 22 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 5866052c..b6b4fec0 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -6,6 +6,7 @@ create_uniform_generator, compute_generalized_advantages, ) +from learning.modules import SmoothActor class PPO2: @@ -99,7 +100,7 @@ def update_actor(self, data): self.mean_surrogate_loss = 0 counter = 0 - self.actor.act(data["actor_obs"]) + self.actor.update_distribution(data["actor_obs"]) data["old_sigma_batch"] = self.actor.action_std.detach() data["old_mu_batch"] = 
self.actor.action_mean.detach() data["old_actions_log_prob_batch"] = self.actor.get_actions_log_prob( @@ -111,8 +112,12 @@ def update_actor(self, data): batch_size = total_data // self.num_mini_batches generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) for batch in generator: + # * Re-sample noise for smooth actor + if isinstance(self.actor, SmoothActor): + self.actor.sample_weights(batch_size) + # ! refactor how this is done - self.actor.act(batch["actor_obs"]) + self.actor.update_distribution(batch["actor_obs"]) actions_log_prob_batch = self.actor.get_actions_log_prob(batch["actions"]) mu_batch = self.actor.action_mean sigma_batch = self.actor.action_std diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py index 91754aeb..dae53c9e 100644 --- a/learning/modules/smooth_actor.py +++ b/learning/modules/smooth_actor.py @@ -13,14 +13,13 @@ class SmoothActor(Actor): weights_dist: Normal _latent_sde: torch.Tensor - exploration_mat: torch.Tensor exploration_matrices: torch.Tensor def __init__( self, *args, full_std: bool = True, - use_exp_ln: bool = False, + use_exp_ln: bool = True, learn_features: bool = True, epsilon: float = 1e-6, log_std_init: float = 0.0, @@ -60,8 +59,6 @@ def sample_weights(self, batch_size=1): # Sample weights for the noise exploration matrix std = self.get_std() self.weights_dist = Normal(torch.zeros_like(std), std) - # Reparametrization trick to pass gradients - self.exploration_mat = self.weights_dist.rsample() # Pre-compute matrices in case of parallel exploration self.exploration_matrices = self.weights_dist.rsample((batch_size,)) @@ -87,12 +84,20 @@ def get_std(self): def update_distribution(self, observations): if self._normalize_obs: - observations = self.normalize(observations) + with torch.no_grad(): + observations = self.obs_rms(observations) # Get latent features and compute distribution self._latent_sde = self.latent_net(observations) if not self.learn_features: self._latent_sde = self._latent_sde.detach() - variance = torch.mm(self._latent_sde**2, self.get_std() ** 2) + if self._latent_sde.dim() == 2: + variance = torch.mm(self._latent_sde**2, self.get_std() ** 2) + elif self._latent_sde.dim() == 3: + variance = torch.einsum( + "abc,cd->abd", self._latent_sde**2, self.get_std() ** 2 + ) + else: + raise ValueError("Invalid latent_sde dimension") mean_actions = self.mean_actions_net(self._latent_sde) self.distribution = Normal(mean_actions, torch.sqrt(variance + self.epsilon)) @@ -107,7 +112,8 @@ def act(self, observations): def act_inference(self, observations): if self._normalize_obs: - observations = self.normalize(observations) + with torch.no_grad(): + observations = self.obs_rms(observations) latent_sde = self.latent_net(observations) mean_actions = self.mean_actions_net(latent_sde) return mean_actions @@ -116,12 +122,9 @@ def get_noise(self): latent_sde = self._latent_sde if not self.learn_features: latent_sde = latent_sde.detach() - # Default case: only one exploration matrix - if len(latent_sde) == 1 or len(latent_sde) != len(self.exploration_matrices): - return torch.mm(latent_sde, self.exploration_mat.to(latent_sde.device)) # Use batch matrix multiplication for efficient computation # (batch_size, n_features) -> (batch_size, 1, n_features) latent_sde = latent_sde.unsqueeze(dim=1) # (batch_size, 1, n_actions) - noise = torch.bmm(latent_sde, self.exploration_matrices) + noise = torch.bmm(latent_sde, self.exploration_matrices.to(latent_sde.device)) return noise.squeeze(dim=1) diff --git 
a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py index 9e5ab2e3..39ce336d 100644 --- a/learning/runners/BaseRunner.py +++ b/learning/runners/BaseRunner.py @@ -1,6 +1,6 @@ import torch from learning.algorithms import * # noqa: F403 -from learning.modules import Actor, Critic +from learning.modules import Actor, Critic, SmoothActor from learning.utils import remove_zero_weighted_rewards @@ -22,7 +22,10 @@ def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) num_actions = self.get_action_size(self.actor_cfg["actions"]) num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) - actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + if self.actor_cfg["smooth_exploration"]: + actor = SmoothActor(num_actor_obs, num_actions, **self.actor_cfg) + else: + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) critic = Critic(num_critic_obs, **self.critic_cfg) alg_class = eval(self.cfg["algorithm_class_name"]) self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) diff --git a/learning/runners/old_policy_runner.py b/learning/runners/old_policy_runner.py index 42f019a3..8b58dbe4 100644 --- a/learning/runners/old_policy_runner.py +++ b/learning/runners/old_policy_runner.py @@ -4,7 +4,7 @@ from learning.utils import Logger from .BaseRunner import BaseRunner from learning.algorithms import PPO # noqa: F401 -from learning.modules import ActorCritic, Actor, Critic +from learning.modules import ActorCritic, Actor, Critic, SmoothActor logger = Logger() @@ -24,7 +24,10 @@ def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) num_actions = self.get_action_size(self.actor_cfg["actions"]) num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) - actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + if self.actor_cfg["smooth_exploration"]: + actor = SmoothActor(num_actor_obs, num_actions, **self.actor_cfg) + else: + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) critic = Critic(num_critic_obs, **self.critic_cfg) actor_critic = ActorCritic(actor, critic) alg_class = eval(self.cfg["algorithm_class_name"]) @@ -42,6 +45,10 @@ def learn(self): self.save() + # * Initialize smooth exploration matrices + if self.actor_cfg["smooth_exploration"]: + self.alg.actor_critic.actor.sample_weights(batch_size=self.env.num_envs) + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") @@ -49,6 +56,13 @@ def learn(self): # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): + # * Re-sample noise matrix for smooth exploration + sample_freq = self.actor_cfg["exploration_sample_freq"] + if self.actor_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor_critic.actor.sample_weights( + batch_size=self.env.num_envs + ) + actions = self.alg.act(actor_obs, critic_obs) self.set_actions( self.actor_cfg["actions"], diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 3d38b48a..79f086d7 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -32,6 +32,10 @@ def learn(self, states_to_log_dict=None): tot_iter = self.it + self.num_learning_iterations self.save() + # * Initialize smooth exploration matrices + if self.actor_cfg["smooth_exploration"]: + self.alg.actor.sample_weights(batch_size=self.env.num_envs) + # * start up storage transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) transition.update( @@ -65,11 +69,9 @@ def learn(self, 
states_to_log_dict=None): with torch.inference_mode(): for i in range(self.num_steps_per_env): # * Re-sample noise matrix for smooth exploration - sample_freq = self.policy_cfg["exploration_sample_freq"] - if self.policy_cfg["smooth_exploration"] and i % sample_freq == 0: - self.alg.actor_critic.actor.sample_weights( - batch_size=self.env.num_envs - ) + sample_freq = self.actor_cfg["exploration_sample_freq"] + if self.actor_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor.sample_weights(batch_size=self.env.num_envs) actions = self.alg.act(actor_obs, critic_obs) self.set_actions( From 91656e9b81d4a1722100d100337c63dd31d0cd56 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 25 Apr 2024 20:08:52 -0400 Subject: [PATCH 14/42] run 200 iterations before starting training, to burn in normalization --- learning/runners/on_policy_runner.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 79f086d7..fc754e17 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -54,6 +54,20 @@ def learn(self, states_to_log_dict=None): device=self.device, ) + # burn in observation normalization. + with torch.inference_mode(): + for _ in range(200): + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + self.env.step() + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") From f74e7feaa103c7ace373c55b9e95db0d0a21f38f Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 3 May 2024 14:37:30 -0400 Subject: [PATCH 15/42] update dummy input in export_network --- learning/modules/utils/neural_net.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index 057e724f..c28762be 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -80,7 +80,7 @@ def export_network(network, network_name, path, num_inputs): model = copy.deepcopy(network).to("cpu") # To trace model, must be evaluated once with arbitrary input model.eval() - dummy_input = torch.rand((1, num_inputs)) + dummy_input = torch.rand((num_inputs)) model_traced = torch.jit.trace(model, dummy_input) torch.jit.save(model_traced, path_TS) torch.onnx.export(model_traced, dummy_input, path_onnx) From 18045e9289dec5f6d8c3cc74cacaf6b873489fe3 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 3 May 2024 18:50:12 -0400 Subject: [PATCH 16/42] good choice of params: sample 16, rollout 32, LR x1.1, des_kl 0.02 --- gym/envs/mini_cheetah/mini_cheetah_ref_config.py | 8 ++++---- learning/algorithms/ppo2.py | 4 ++-- learning/runners/on_policy_runner.py | 4 +++- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 2e108f16..a6f12e18 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -74,7 +74,7 @@ class actor: # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" smooth_exploration = True - exploration_sample_freq = 8 + exploration_sample_freq = 16 normalize_obs = True obs = [ @@ -151,17 +151,17 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): num_mini_batches = 4 
storage_size = 2**17 # new mini_batch_size = 2**15 # new - learning_rate = 5.0e-5 + learning_rate = 0.002 schedule = "adaptive" # can be adaptive, fixed discount_horizon = 1.0 # [s] lam = 0.95 GAE_bootstrap_horizon = 2.0 # [s] - desired_kl = 0.01 + desired_kl = 0.02 max_grad_norm = 1.0 class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" - max_iterations = 500 # number of policy updates + max_iterations = 800 # number of policy updates algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index b6b4fec0..58945bb3 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -139,9 +139,9 @@ def update_actor(self, data): kl_mean = torch.mean(kl) if kl_mean > self.desired_kl * 2.0: - self.learning_rate = max(1e-5, self.learning_rate / 1.5) + self.learning_rate = max(2e-4, self.learning_rate / 1.1) elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: - self.learning_rate = min(1e-2, self.learning_rate * 1.5) + self.learning_rate = min(1e-2, self.learning_rate * 1.1) for param_group in self.optimizer.param_groups: # ! check this diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index fc754e17..1e885e81 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -167,7 +167,9 @@ def set_up_logger(self): ) logger.register_rewards(["total_rewards"]) logger.register_category( - "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + "algorithm", + self.alg, + ["mean_value_loss", "mean_surrogate_loss", "learning_rate"], ) logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) From adfb0788fb61883f20c15dbacc93ec9010b471ac Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Sun, 5 May 2024 22:48:28 -0400 Subject: [PATCH 17/42] export latent network and update configs --- gym/envs/mini_cheetah/mini_cheetah_config.py | 24 +++++++++++-------- .../mini_cheetah/mini_cheetah_ref_config.py | 7 ++++-- learning/algorithms/ppo2.py | 13 ++++++++-- learning/modules/utils/neural_net.py | 10 +++++++- .../sweep_config_smooth_exploration.json | 3 --- 5 files changed, 39 insertions(+), 18 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index 104e5f64..6631704c 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -130,16 +130,17 @@ class actor: hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - smooth_exploration = False + smooth_exploration = True + exploration_sample_freq = 16 obs = [ - "base_lin_vel", + # "base_lin_vel", "base_ang_vel", "projected_gravity", "commands", "dof_pos_obs", "dof_vel", - "dof_pos_target", + # "dof_pos_target", ] actions = ["dof_pos_target"] add_noise = True @@ -195,20 +196,23 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): value_loss_coef = 1.0 use_clipped_value_loss = True clip_param = 0.2 - entropy_coef = 0.02 - num_learning_epochs = 4 + entropy_coef = 0.01 + num_learning_epochs = 6 # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 8 - learning_rate = 1.0e-5 - schedule = "adaptive" # can be adaptive or fixed + num_mini_batches = 4 discount_horizon = 1.0 # [s] # GAE_bootstrap_horizon = 2.0 # [s] - desired_kl = 0.01 + desired_kl = 0.02 max_grad_norm = 1.0 + # * Learning rate + learning_rate = 0.002 + schedule = "adaptive" # can be adaptive or fixed 
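+        # The adaptive schedule multiplies/divides learning_rate by lr_ratio and
+        # clips it to lr_range, depending on the measured KL (see PPO2.update_actor)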
+ lr_range = [2e-4, 1e-2] + lr_ratio = 1.3 class runner(LeggedRobotRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah" - max_iterations = 500 + max_iterations = 800 algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index a6f12e18..4f50f7d2 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -151,13 +151,16 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): num_mini_batches = 4 storage_size = 2**17 # new mini_batch_size = 2**15 # new - learning_rate = 0.002 - schedule = "adaptive" # can be adaptive, fixed discount_horizon = 1.0 # [s] lam = 0.95 GAE_bootstrap_horizon = 2.0 # [s] desired_kl = 0.02 max_grad_norm = 1.0 + # * Learning rate + learning_rate = 0.002 + schedule = "adaptive" # can be adaptive or fixed + lr_range = [3e-4, 1e-2] + lr_ratio = 1.3 class runner(MiniCheetahRunnerCfg.runner): run_name = "" diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 58945bb3..528ef882 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -27,6 +27,8 @@ def __init__( desired_kl=0.01, loss_fn="MSE", device="cpu", + lr_range=[1e-4, 1e-2], + lr_ratio=1.3, **kwargs, ): self.device = device @@ -34,6 +36,8 @@ def __init__( self.desired_kl = desired_kl self.schedule = schedule self.learning_rate = learning_rate + self.lr_range = lr_range + self.lr_ratio = lr_ratio # * PPO components self.actor = actor.to(self.device) @@ -137,11 +141,16 @@ def update_actor(self, data): axis=-1, ) kl_mean = torch.mean(kl) + lr_min, lr_max = self.lr_range if kl_mean > self.desired_kl * 2.0: - self.learning_rate = max(2e-4, self.learning_rate / 1.1) + self.learning_rate = max( + lr_min, self.learning_rate / self.lr_ratio + ) elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: - self.learning_rate = min(1e-2, self.learning_rate * 1.1) + self.learning_rate = min( + lr_max, self.learning_rate * self.lr_ratio + ) for param_group in self.optimizer.param_groups: # ! check this diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index c28762be..24ed3df7 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -62,7 +62,7 @@ def add_layer(layer_list, num_inputs, num_outputs, activation=None, dropout=0): layer_list.append(activation) -def export_network(network, network_name, path, num_inputs): +def export_network(network, network_name, path, num_inputs, latent=True): """ Thsi function traces and exports the given network module in .pt and .onnx file formats. 
These can be used for evaluation on other systems @@ -84,3 +84,11 @@ def export_network(network, network_name, path, num_inputs): model_traced = torch.jit.trace(model, dummy_input) torch.jit.save(model_traced, path_TS) torch.onnx.export(model_traced, dummy_input, path_onnx) + + if latent: + path_latent = os.path.join(path, network_name + "_latent.onnx") + model_latent = model.latent_net + model_latent.eval() + dummy_input = torch.rand((num_inputs)) + model_traced = torch.jit.trace(model_latent, dummy_input) + torch.onnx.export(model_traced, dummy_input, path_latent) diff --git a/scripts/sweep_configs/sweep_config_smooth_exploration.json b/scripts/sweep_configs/sweep_config_smooth_exploration.json index 952282fe..49fc4125 100644 --- a/scripts/sweep_configs/sweep_config_smooth_exploration.json +++ b/scripts/sweep_configs/sweep_config_smooth_exploration.json @@ -11,9 +11,6 @@ }, "train_cfg.runner.num_steps_per_env":{ "values": [32, 64] - }, - "train_cfg.runner.max_iterations":{ - "values": [800] } } } \ No newline at end of file From 8ff82ae26b1e73daf0f72dc76af7756aab12de83 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Mon, 6 May 2024 11:13:00 -0400 Subject: [PATCH 18/42] export latent net with norm and log get_std --- learning/modules/smooth_actor.py | 9 ++++----- learning/modules/utils/neural_net.py | 2 +- learning/runners/on_policy_runner.py | 4 +++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py index dae53c9e..5cb1da89 100644 --- a/learning/modules/smooth_actor.py +++ b/learning/modules/smooth_actor.py @@ -57,11 +57,12 @@ def __init__( def sample_weights(self, batch_size=1): # Sample weights for the noise exploration matrix - std = self.get_std() + std = self.get_std self.weights_dist = Normal(torch.zeros_like(std), std) # Pre-compute matrices in case of parallel exploration self.exploration_matrices = self.weights_dist.rsample((batch_size,)) + @property def get_std(self): # TODO[lm]: Check if this is ok, and can use action_std in ActorCritic normally if self.use_exp_ln: @@ -91,11 +92,9 @@ def update_distribution(self, observations): if not self.learn_features: self._latent_sde = self._latent_sde.detach() if self._latent_sde.dim() == 2: - variance = torch.mm(self._latent_sde**2, self.get_std() ** 2) + variance = torch.mm(self._latent_sde**2, self.get_std**2) elif self._latent_sde.dim() == 3: - variance = torch.einsum( - "abc,cd->abd", self._latent_sde**2, self.get_std() ** 2 - ) + variance = torch.einsum("abc,cd->abd", self._latent_sde**2, self.get_std**2) else: raise ValueError("Invalid latent_sde dimension") mean_actions = self.mean_actions_net(self._latent_sde) diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index 24ed3df7..034c13d4 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -87,7 +87,7 @@ def export_network(network, network_name, path, num_inputs, latent=True): if latent: path_latent = os.path.join(path, network_name + "_latent.onnx") - model_latent = model.latent_net + model_latent = torch.nn.Sequential(model.obs_rms, model.latent_net) model_latent.eval() dummy_input = torch.rand((num_inputs)) model_traced = torch.jit.trace(model_latent, dummy_input) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 1e885e81..dded2739 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -171,7 +171,9 @@ def set_up_logger(self): 
self.alg, ["mean_value_loss", "mean_surrogate_loss", "learning_rate"], ) - logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + logger.register_category( + "actor", self.alg.actor, ["action_std", "entropy", "get_std"] + ) logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) From 007ea933c6d72077c8582e44162a121490cc529d Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Mon, 6 May 2024 14:22:30 -0400 Subject: [PATCH 19/42] export actor std to txt file --- learning/modules/utils/neural_net.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index 034c13d4..6e6f41d6 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -1,6 +1,7 @@ import torch import os import copy +import numpy as np def create_MLP( @@ -86,9 +87,16 @@ def export_network(network, network_name, path, num_inputs, latent=True): torch.onnx.export(model_traced, dummy_input, path_onnx) if latent: + # Export latent model path_latent = os.path.join(path, network_name + "_latent.onnx") model_latent = torch.nn.Sequential(model.obs_rms, model.latent_net) model_latent.eval() dummy_input = torch.rand((num_inputs)) model_traced = torch.jit.trace(model_latent, dummy_input) torch.onnx.export(model_traced, dummy_input, path_latent) + + # Save actor std of shape (num_actions, latent_dim) + # It is important that the shape is the same as the exploration matrix + path_std = os.path.join(path, network_name + "_std.txt") + std_transposed = model.get_std.numpy().T + np.savetxt(path_std, std_transposed) From 1ae4ac007fe312a3b4efbef3937e7ba9f3d8117a Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 5 Jul 2024 16:10:31 -0400 Subject: [PATCH 20/42] cleanup branch, remove logging and plotting --- .gitignore | 4 - gym/envs/mini_cheetah/mini_cheetah_config.py | 4 +- gym/smooth_exploration/plot_ft.py | 56 -------- gym/smooth_exploration/plot_play.py | 38 ------ gym/smooth_exploration/plot_train.py | 93 -------------- scripts/log_play.py | 121 ------------------ scripts/log_train.py | 101 --------------- .../sweep_config_smooth_exploration.json | 16 --- 8 files changed, 2 insertions(+), 431 deletions(-) delete mode 100644 gym/smooth_exploration/plot_ft.py delete mode 100644 gym/smooth_exploration/plot_play.py delete mode 100644 gym/smooth_exploration/plot_train.py delete mode 100644 scripts/log_play.py delete mode 100644 scripts/log_train.py delete mode 100644 scripts/sweep_configs/sweep_config_smooth_exploration.json diff --git a/.gitignore b/.gitignore index c78e6696..0b68375a 100644 --- a/.gitignore +++ b/.gitignore @@ -81,7 +81,3 @@ ipython_config.py venv/ env.bak/ venv.bak/ - -# Smooth exploration -gym/smooth_exploration/data* -gym/smooth_exploration/figures* diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index 6631704c..2bd7958d 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -134,13 +134,13 @@ class actor: exploration_sample_freq = 16 obs = [ - # "base_lin_vel", + "base_lin_vel", "base_ang_vel", "projected_gravity", "commands", "dof_pos_obs", "dof_vel", - # "dof_pos_target", + "dof_pos_target", ] actions = ["dof_pos_target"] add_noise = True diff --git a/gym/smooth_exploration/plot_ft.py b/gym/smooth_exploration/plot_ft.py deleted file mode 100644 index 5ad86b63..00000000 --- a/gym/smooth_exploration/plot_ft.py +++ /dev/null @@ -1,56 +0,0 @@ -import numpy as np 
-import matplotlib.pyplot as plt -import os - -SAMPLE_FREQ = 8 -STEPS = 500 - -smooth_name = "ref_sample_8_len_500" -baseline_name = "ref_baseline_len_500" - -smooth_data_dir = "./data_train/" + smooth_name -baseline_data_dir = "./data_train/" + baseline_name -fig_dir = "./figures_train/" - -if not os.path.exists(fig_dir): - os.makedirs(fig_dir) - -# load data -smooth_pos_target = np.load(smooth_data_dir + "/dof_pos_target.npy")[0] -baseline_pos_target = np.load(baseline_data_dir + "/dof_pos_target.npy")[0] -smooth_terminated = np.load(smooth_data_dir + "/terminated.npy")[0] -baseline_terminated = np.load(baseline_data_dir + "/terminated.npy")[0] - -# compute FFT averages -smooth_ffts = [[], [], []] -baseline_ffts = [[], [], []] -for it in range(0, smooth_pos_target.shape[0], 10): - # only use data that didn't terminate - if not np.any(smooth_terminated[it, :, 0]): - for idx in range(3): - fft = np.fft.fft(smooth_pos_target[it, :, idx]) - smooth_ffts[idx].append(fft[: len(fft) // 2]) - - if not np.any(baseline_terminated[it, :, 0]): - for idx in range(3): - fft = np.fft.fft(baseline_pos_target[it, :, idx]) - baseline_ffts[idx].append(fft[: len(fft) // 2]) - -print(f"Total smooth FFTS: {len(smooth_ffts[0])}") -print(f"Total baseline FFTS: {len(baseline_ffts[0])}") - -smooth_fft_means = [np.array(smooth_ffts[idx]).mean(axis=0) for idx in range(3)] -baseline_fft_means = [np.array(baseline_ffts[idx]).mean(axis=0) for idx in range(3)] - -# plot FFTs -fig, axs = plt.subplots(3, 1, figsize=(10, 10)) -for idx in range(3): - axs[idx].plot(np.abs(smooth_fft_means[idx])) - axs[idx].plot(np.abs(baseline_fft_means[idx])) - axs[idx].set_title(f"FT Amplitude idx {idx}") - axs[idx].set_xlabel("Frequency") - axs[idx].set_ylabel("Amplitude") - axs[idx].legend(["smooth", "baseline"]) - -fig.tight_layout() -fig.savefig(fig_dir + "/" + smooth_name + ".png") diff --git a/gym/smooth_exploration/plot_play.py b/gym/smooth_exploration/plot_play.py deleted file mode 100644 index 5768f91a..00000000 --- a/gym/smooth_exploration/plot_play.py +++ /dev/null @@ -1,38 +0,0 @@ -import numpy as np -import matplotlib.pyplot as plt - -name = "mini_cheetah_ref" -data_dir = "./data_play/" + name -fig_dir = "./figures_play/" + name - -# load data -dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0] -dof_pos_target = np.load(data_dir + "/dof_pos_target.npy")[0] -dof_vel = np.load(data_dir + "/dof_vel.npy")[0] - -# plot data -n_steps = 200 -fig, axs = plt.subplots(3, figsize=(10, 10)) -plt.suptitle(name) - -for i in range(3): - axs[0].plot(dof_pos_obs[:n_steps, i]) -axs[0].set_title("dof_pos_obs") -axs[0].legend(["idx 0", "idx 1", "idx 2"]) -axs[0].set_xlabel("time steps") - -for i in range(3): - axs[1].plot(dof_pos_target[:n_steps, i]) -axs[1].set_title("dof_pos_target") -axs[1].legend(["idx 0", "idx 1", "idx 2"]) -axs[1].set_xlabel("time steps") - -for i in range(3): - axs[2].plot(dof_vel[:n_steps, i]) -axs[2].set_title("dof_vel") -axs[2].legend(["idx 0", "idx 1", "idx 2"]) -axs[2].set_xlabel("time steps") - -plt.tight_layout() -plt.savefig(fig_dir + "/" + name + ".png") -plt.show() diff --git a/gym/smooth_exploration/plot_train.py b/gym/smooth_exploration/plot_train.py deleted file mode 100644 index 58c50396..00000000 --- a/gym/smooth_exploration/plot_train.py +++ /dev/null @@ -1,93 +0,0 @@ -import numpy as np -import matplotlib.pyplot as plt -import os - -FOURIER = True -SMOOTH = True -SAMPLE_FREQ = 16 -STEPS = 1000 - -name = "ref_sample_16_len_1000" -data_dir = "./data_train/" + name -fig_dir = "./figures_train/" + name - 
-if not os.path.exists(fig_dir): - os.makedirs(fig_dir) - -# load data -dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0] -dof_pos_target = np.load(data_dir + "/dof_pos_target.npy")[0] -dof_vel = np.load(data_dir + "/dof_vel.npy")[0] -torques = np.load(data_dir + "/torques.npy")[0] -terminated = np.load(data_dir + "/terminated.npy")[0] - - -# plot fourier trainsform -def plot_fourier(data, it): - fig_ft, axs_ft = plt.subplots(2, figsize=(10, 10)) - for i in range(3): - ft = np.fft.fft(data[:, i]) - ft_half = ft[: len(ft) // 2] - axs_ft[0].plot(np.abs(ft_half)) - axs_ft[1].plot(np.angle(ft_half)) - - axs_ft[0].set_title("FT Amplitude") - axs_ft[0].set_xlabel("Frequency") - axs_ft[0].set_ylabel("Amplitude") - axs_ft[0].legend(["idx 0", "idx 1", "idx 2"]) - axs_ft[1].set_title("FT Phase") - axs_ft[1].set_xlabel("Frequency") - axs_ft[1].set_ylabel("Phase") - axs_ft[1].legend(["idx 0", "idx 1", "idx 2"]) - - fig_ft.savefig(fig_dir + "/dof_pos_target_FT_it_" + str(it) + ".png") - - -# plot data for each iteration -for it in range(0, dof_pos_obs.shape[0], 10): - # check if iteration terminated - terminate_idx = np.where(terminated[it, :, 0] == 1)[0] - if terminate_idx.size > 0: - n_steps = terminate_idx[0] - else: - n_steps = dof_pos_obs.shape[1] - print(n_steps) - - # generate figure - fig, axs = plt.subplots(4, figsize=(10, 10)) - plt.suptitle(name + " iteration " + str(it)) - - axs[0].set_title("dof_pos_obs") - for i in range(3): - axs[0].plot(dof_pos_obs[it, :n_steps, i]) - - axs[1].set_title("dof_pos_target") - for i in range(3): - axs[1].plot(dof_pos_target[it, :n_steps, i]) - if FOURIER and n_steps == STEPS: - plot_fourier(dof_pos_target[it, :n_steps, :], it) - - axs[2].set_title("dof_vel") - for i in range(3): - axs[2].plot(dof_vel[it, :n_steps, i]) - - axs[3].set_title("torques") - for i in range(3): - axs[3].plot(torques[it, :n_steps, i]) - - # format plots - for idx in range(4): - axs[idx].legend(["idx 0", "idx 1", "idx 2"]) - axs[idx].set_xlabel("time steps") - axs[idx].set_xlim([0, n_steps]) - - if SMOOTH: - # plot vertical lines where noise is resampled - for x in range(0, dof_pos_obs.shape[1], SAMPLE_FREQ): - axs[0].axvline(x, color="r", linestyle="--") - axs[1].axvline(x, color="r", linestyle="--") - axs[2].axvline(x, color="r", linestyle="--") - axs[3].axvline(x, color="r", linestyle="--") - - fig.tight_layout() - fig.savefig(fig_dir + "/" + name + "_it_" + str(it) + ".png") diff --git a/scripts/log_play.py b/scripts/log_play.py deleted file mode 100644 index f7fd68e8..00000000 --- a/scripts/log_play.py +++ /dev/null @@ -1,121 +0,0 @@ -from gym.envs import __init__ # noqa: F401 -from gym.utils import get_args, task_registry -from gym.utils import KeyboardInterface -from gym.utils import VisualizationRecorder -from gym import LEGGED_GYM_ROOT_DIR - -# torch needs to be imported after isaacgym imports in local source -import torch -import os -import numpy as np - -TEST_TOTAL_TIMESTEPS = 1000 - - -def create_logging_dict(runner, test_total_timesteps): - states_to_log = [ - "dof_pos_target", - "dof_pos_obs", - "dof_vel", - "torques", - "commands", - # 'base_lin_vel', - # 'base_ang_vel', - # 'oscillators', - # 'grf', - # 'base_height' - ] - - states_to_log_dict = {} - - for state in states_to_log: - array_dim = runner.get_obs_size( - [ - state, - ] - ) - states_to_log_dict[state] = torch.zeros( - (1, test_total_timesteps, array_dim), device=runner.env.device - ) - return states_to_log_dict - - -def setup(args): - env_cfg, train_cfg = task_registry.create_cfgs(args) - 
env_cfg.env.num_envs = min(env_cfg.env.num_envs, 16) - if hasattr(env_cfg, "push_robots"): - env_cfg.push_robots.toggle = False - if hasattr(env_cfg, "commands"): - env_cfg.commands.resampling_time = 9999 - env_cfg.env.episode_length_s = 9999 - env_cfg.env.num_projectiles = 20 - task_registry.make_gym_and_sim() - env = task_registry.make_env(args.task, env_cfg) - env.cfg.init_state.reset_mode = "reset_to_basic" - train_cfg.runner.resume = True - train_cfg.logging.enable_local_saving = False - runner = task_registry.make_alg_runner(env, train_cfg) - - # * switch to evaluation mode (dropout for example) - runner.switch_to_eval() - return env, runner, train_cfg - - -def play(env, runner, train_cfg): - protocol_name = train_cfg.runner.experiment_name - - # * set up logging - log_file_path = os.path.join( - LEGGED_GYM_ROOT_DIR, - "gym", - "smooth_exploration", - "data_play", - protocol_name + ".npz", - ) - if not os.path.exists(os.path.dirname(log_file_path)): - os.makedirs(os.path.dirname(log_file_path)) - - states_to_log_dict = create_logging_dict(runner, TEST_TOTAL_TIMESTEPS) - - # * set up recording - if env.cfg.viewer.record: - recorder = VisualizationRecorder( - env, train_cfg.runner.experiment_name, train_cfg.runner.load_run - ) - - # * set up interface: GamepadInterface(env) or KeyboardInterface(env) - COMMANDS_INTERFACE = hasattr(env, "commands") - if COMMANDS_INTERFACE: - # interface = GamepadInterface(env) - interface = KeyboardInterface(env) - - for t in range(TEST_TOTAL_TIMESTEPS): - if COMMANDS_INTERFACE: - interface.update(env) - if env.cfg.viewer.record: - recorder.update(t) - runner.set_actions( - runner.policy_cfg["actions"], - runner.get_inference_actions(), - runner.policy_cfg["disable_actions"], - ) - env.step() - env.check_exit() - - # * log - for state in states_to_log_dict: - states_to_log_dict[state][:, t, :] = getattr(env, state)[0, :] - - # * save data - # first convert tensors to cpu - log_dict_cpu = {k: v.cpu() for k, v in states_to_log_dict.items()} - np.savez_compressed(log_file_path, **log_dict_cpu) - print("saved to ", log_file_path) - return states_to_log_dict - - -if __name__ == "__main__": - args = get_args() - with torch.no_grad(): - env, runner, train_cfg = setup(args) - play(env, runner, train_cfg) diff --git a/scripts/log_train.py b/scripts/log_train.py deleted file mode 100644 index 37fca0da..00000000 --- a/scripts/log_train.py +++ /dev/null @@ -1,101 +0,0 @@ -from gym.envs import __init__ # noqa: F401 -from gym.utils import get_args, task_registry, randomize_episode_counters -from gym.utils.logging_and_saving import wandb_singleton -from gym.utils.logging_and_saving import local_code_save_helper -from gym import LEGGED_GYM_ROOT_DIR - -# torch needs to be imported after isaacgym imports in local source -import torch -import os -import numpy as np - -TRAIN_ITERATIONS = 100 -ROLLOUT_TIMESTEPS = 1000 - - -def create_logging_dict(runner): - states_to_log = [ - "dof_pos_target", - "dof_pos_obs", - "dof_vel", - "torques", - "commands", - # 'base_lin_vel', - # 'base_ang_vel', - # 'oscillators', - # 'grf', - # 'base_height' - ] - - states_to_log_dict = {} - - for state in states_to_log: - array_dim = runner.get_obs_size( - [ - state, - ] - ) - states_to_log_dict[state] = torch.zeros( - (1, TRAIN_ITERATIONS, ROLLOUT_TIMESTEPS, array_dim), - device=runner.env.device, - ) - - # log whether rollout terminated - states_to_log_dict["terminated"] = torch.zeros( - (1, TRAIN_ITERATIONS, ROLLOUT_TIMESTEPS, 1), device=runner.env.device - ) - return states_to_log_dict - - 
-def setup(): - args = get_args() - wandb_helper = wandb_singleton.WandbSingleton() - - env_cfg, train_cfg = task_registry.create_cfgs(args) - task_registry.make_gym_and_sim() - wandb_helper.setup_wandb(env_cfg=env_cfg, train_cfg=train_cfg, args=args) - env = task_registry.make_env(name=args.task, env_cfg=env_cfg) - randomize_episode_counters(env) - - train_cfg.runner.max_iterations = TRAIN_ITERATIONS - policy_runner = task_registry.make_alg_runner(env, train_cfg) - - local_code_save_helper.save_local_files_to_logs(train_cfg.log_dir) - - return train_cfg, policy_runner - - -def train(train_cfg, policy_runner): - protocol_name = train_cfg.runner.experiment_name - - # * set up logging - log_file_path = os.path.join( - LEGGED_GYM_ROOT_DIR, - "gym", - "smooth_exploration", - "data_train", - protocol_name + ".npz", - ) - if not os.path.exists(os.path.dirname(log_file_path)): - os.makedirs(os.path.dirname(log_file_path)) - - states_to_log_dict = create_logging_dict(policy_runner) - - # * train - # wandb_helper = wandb_singleton.WandbSingleton() - - policy_runner.learn(states_to_log_dict) - - # wandb_helper.close_wandb() - - # * save data - # first convert tensors to cpu - log_dict_cpu = {k: v.cpu() for k, v in states_to_log_dict.items()} - np.savez_compressed(log_file_path, **log_dict_cpu) - print("saved to ", log_file_path) - return states_to_log_dict - - -if __name__ == "__main__": - train_cfg, policy_runner = setup() - train(train_cfg=train_cfg, policy_runner=policy_runner) diff --git a/scripts/sweep_configs/sweep_config_smooth_exploration.json b/scripts/sweep_configs/sweep_config_smooth_exploration.json deleted file mode 100644 index 49fc4125..00000000 --- a/scripts/sweep_configs/sweep_config_smooth_exploration.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "method": "grid", - "metric": { - "name": "rewards/total_rewards", - "goal": "maximize" - }, - "run_cap": 10, - "parameters": { - "train_cfg.policy.exploration_sample_freq":{ - "values": [4, 8, 16, 32] - }, - "train_cfg.runner.num_steps_per_env":{ - "values": [32, 64] - } - } -} \ No newline at end of file From 70da359c9f20f908138fe50eabf7019c8b05d7d1 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Mon, 8 Jul 2024 15:01:55 -0400 Subject: [PATCH 21/42] PPO, PPO-smooth and IPG all learn (IPG-smooth kind of) --- .../mini_cheetah/mini_cheetah_ref_config.py | 29 +- learning/algorithms/__init__.py | 1 + learning/algorithms/ppo2.py | 2 +- learning/algorithms/ppo_ipg.py | 271 +++++++++++++++++ learning/runners/__init__.py | 3 +- learning/runners/ipg_runner.py | 274 ++++++++++++++++++ learning/runners/on_policy_runner.py | 84 ++---- learning/storage/__init__.py | 3 +- learning/storage/replay_buffer.py | 54 ++++ learning/utils/__init__.py | 5 +- learning/utils/utils.py | 6 + 11 files changed, 650 insertions(+), 82 deletions(-) create mode 100644 learning/algorithms/ppo_ipg.py create mode 100644 learning/runners/ipg_runner.py create mode 100644 learning/storage/replay_buffer.py diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 4f50f7d2..eba8c34a 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -67,13 +67,13 @@ class scaling(MiniCheetahCfg.scaling): class MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg): seed = -1 - runner_class_name = "OnPolicyRunner" + runner_class_name = "IPGRunner" class actor: hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - 
smooth_exploration = True + smooth_exploration = False exploration_sample_freq = 16 normalize_obs = True @@ -142,6 +142,8 @@ class termination_weight: class algorithm(MiniCheetahRunnerCfg.algorithm): # training params + gamma = 0.99 + lam = 0.95 value_loss_coef = 1.0 # deprecate for PPO2 use_clipped_value_loss = True # deprecate for PPO2 clip_param = 0.2 @@ -149,22 +151,25 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): num_learning_epochs = 6 # mini batch size = num_envs*nsteps/nminibatches num_mini_batches = 4 - storage_size = 2**17 # new mini_batch_size = 2**15 # new - discount_horizon = 1.0 # [s] - lam = 0.95 - GAE_bootstrap_horizon = 2.0 # [s] - desired_kl = 0.02 + + desired_kl = 0.02 # 0.02 for smooth-expl, else 0.01 max_grad_norm = 1.0 - # * Learning rate - learning_rate = 0.002 + learning_rate = 0.002 # initial value schedule = "adaptive" # can be adaptive or fixed lr_range = [3e-4, 1e-2] - lr_ratio = 1.3 + lr_ratio = 1.3 # 1.3 for smooth-expl, else 1.5 + + # IPG + storage_size = 4 * 32 * 4096 # old_policies*nsteps*num_envs + polyak = 0.995 + use_cv = False + inter_nu = 0.2 + beta = "off_policy" class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" - max_iterations = 800 # number of policy updates - algorithm_class_name = "PPO2" + max_iterations = 500 # number of policy updates + algorithm_class_name = "PPO_IPG" num_steps_per_env = 32 diff --git a/learning/algorithms/__init__.py b/learning/algorithms/__init__.py index ace1b9fe..b775a439 100644 --- a/learning/algorithms/__init__.py +++ b/learning/algorithms/__init__.py @@ -32,4 +32,5 @@ from .ppo import PPO from .ppo2 import PPO2 +from .ppo_ipg import PPO_IPG from .SE import StateEstimator diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 528ef882..e5912e3f 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -63,7 +63,7 @@ def switch_to_train(self): self.actor.train() self.critic.train() - def act(self, obs, critic_obs): + def act(self, obs): return self.actor.act(obs).detach() def update(self, data, last_obs=None): diff --git a/learning/algorithms/ppo_ipg.py b/learning/algorithms/ppo_ipg.py new file mode 100644 index 00000000..ba92b575 --- /dev/null +++ b/learning/algorithms/ppo_ipg.py @@ -0,0 +1,271 @@ +import torch +import torch.nn as nn +import torch.optim as optim + +from learning.utils import ( + create_uniform_generator, + compute_generalized_advantages, + polyak_update, +) +from learning.modules import SmoothActor + + +class PPO_IPG: + def __init__( + self, + actor, + critic_v, + critic_q, + target_critic_q, + num_learning_epochs=1, + num_mini_batches=1, + clip_param=0.2, + gamma=0.998, + lam=0.95, + entropy_coef=0.0, + learning_rate=1e-3, + max_grad_norm=1.0, + use_clipped_value_loss=True, + schedule="fixed", + desired_kl=0.01, + polyak=0.995, + use_cv=False, + inter_nu=0.2, + beta="off_policy", + device="cpu", + **kwargs, + ): + self.device = device + + self.desired_kl = desired_kl + self.schedule = schedule + self.learning_rate = learning_rate + + # * PPO components + self.actor = actor.to(self.device) + self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) + self.critic_v = critic_v.to(self.device) + self.critic_v_optimizer = optim.Adam( + self.critic_v.parameters(), lr=learning_rate + ) + + # * IPG components + self.critic_q = critic_q.to(self.device) + self.critic_q_optimizer = optim.Adam( + self.critic_q.parameters(), lr=learning_rate + ) + self.target_critic_q = target_critic_q.to(self.device) + 
self.target_critic_q.load_state_dict(self.critic_q.state_dict()) + + # * PPO parameters + self.clip_param = clip_param + self.num_learning_epochs = num_learning_epochs + self.num_mini_batches = num_mini_batches + self.entropy_coef = entropy_coef + self.gamma = gamma + self.lam = lam + self.max_grad_norm = max_grad_norm + self.use_clipped_value_loss = use_clipped_value_loss + + # * IPG parameters + self.polyak = polyak + self.use_cv = use_cv + self.inter_nu = inter_nu + self.beta = beta + + def switch_to_train(self): + self.actor.train() + self.critic_v.train() + self.critic_q.train() + + def act(self, obs): + return self.actor.act(obs).detach() + + def update(self, data_onpol, data_offpol, last_obs=None): + # On-policy GAE + if last_obs is None: + last_values = None + else: + with torch.no_grad(): + last_values = self.critic_v.evaluate(last_obs).detach() + compute_generalized_advantages( + data_onpol, self.gamma, self.lam, self.critic_v, last_values + ) + + self.update_critic_q(data_offpol) + + # Freeze Q-network params + for param in self.critic_q.parameters(): + param.requires_grad = False + + self.update_actor(data_onpol, data_offpol) + self.update_critic_v(data_onpol) + + # Un-freeze Q-network params + for param in self.critic_q.parameters(): + param.requires_grad = True + + # print("On-policy data shape: ", data_onpol.shape) + # print("Off-policy data shape: ", data_offpol.shape) + + def update_critic_q(self, data): + self.mean_q_loss = 0 + counter = 0 + + n, m = data.shape + total_data = n * m + batch_size = total_data // self.num_mini_batches + generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + for batch in generator: + with torch.no_grad(): + # TODO: check that should be inference + action_next = self.actor.act_inference(batch["next_actor_obs"]) + q_input_next = torch.cat( + (batch["next_critic_obs"], action_next), dim=-1 + ) + q_next = self.target_critic_q.evaluate(q_input_next) + q_target = ( + batch["rewards"] + + self.gamma * batch["dones"].logical_not() * q_next + ) + q_input = torch.cat((batch["critic_obs"], batch["actions"]), dim=-1) + q_value = self.critic_q.evaluate(q_input) + q_loss = self.critic_q.loss_fn(q_value, q_target) + self.critic_q_optimizer.zero_grad() + q_loss.backward() + nn.utils.clip_grad_norm_(self.critic_q.parameters(), self.max_grad_norm) + self.critic_q_optimizer.step() + self.mean_q_loss += q_loss.item() + counter += 1 + + # TODO: check where to do polyak update (IPG repo does it here) + self.target_critic_q = polyak_update( + self.critic_q, self.target_critic_q, self.polyak + ) + self.mean_q_loss /= counter + + def update_critic_v(self, data): + self.mean_value_loss = 0 + counter = 0 + + n, m = data.shape + total_data = n * m + batch_size = total_data // self.num_mini_batches + generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + for batch in generator: + value_batch = self.critic_v.evaluate(batch["critic_obs"]) + value_loss = self.critic_v.loss_fn(value_batch, batch["returns"]) + self.critic_v_optimizer.zero_grad() + value_loss.backward() + nn.utils.clip_grad_norm_(self.critic_v.parameters(), self.max_grad_norm) + self.critic_v_optimizer.step() + self.mean_value_loss += value_loss.item() + counter += 1 + self.mean_value_loss /= counter + + def update_actor(self, data_onpol, data_offpol): + self.mean_surrogate_loss = 0 + self.mean_offpol_loss = 0 + counter = 0 + + self.actor.update_distribution(data_onpol["actor_obs"]) + data_onpol["old_sigma"] = self.actor.action_std.detach() + 
data_onpol["old_mu"] = self.actor.action_mean.detach() + data_onpol["old_actions_log_prob"] = self.actor.get_actions_log_prob( + data_onpol["actions"] + ).detach() + + # Generate off-policy batches and use all on-policy data + n, m = data_offpol.shape + total_data = n * m + batch_size = total_data // self.num_mini_batches + generator_offpol = create_uniform_generator( + data_offpol, batch_size, self.num_learning_epochs + ) + for batch_offpol in generator_offpol: + # * Re-sample noise for smooth actor + if isinstance(self.actor, SmoothActor): + self.actor.sample_weights(batch_size) + + self.actor.update_distribution(data_onpol["actor_obs"]) + actions_log_prob_onpol = self.actor.get_actions_log_prob( + data_onpol["actions"] + ) + mu_onpol = self.actor.action_mean + sigma_onpol = self.actor.action_std + + # * KL + if self.desired_kl is not None and self.schedule == "adaptive": + with torch.inference_mode(): + kl = torch.sum( + torch.log(sigma_onpol / data_onpol["old_sigma"] + 1.0e-5) + + ( + torch.square(data_onpol["old_sigma"]) + + torch.square(data_onpol["old_mu"] - mu_onpol) + ) + / (2.0 * torch.square(sigma_onpol)) + - 0.5, + axis=-1, + ) + kl_mean = torch.mean(kl) + + if kl_mean > self.desired_kl * 2.0: + self.learning_rate = max(1e-5, self.learning_rate / 1.5) + elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: + self.learning_rate = min(1e-2, self.learning_rate * 1.5) + + for param_group in self.optimizer.param_groups: + # ! check this + param_group["lr"] = self.learning_rate + + # * On-policy surrogate loss + adv_onpol = data_onpol["advantages"] + if self.use_cv: + # TODO: control variate + critic_based_adv = 0 # get_control_variate(data_onpol, self.critic_v) + learning_signals = (adv_onpol - critic_based_adv) * (1 - self.inter_nu) + else: + learning_signals = adv_onpol * (1 - self.inter_nu) + + ratio = torch.exp( + actions_log_prob_onpol - data_onpol["old_actions_log_prob"] + ) + ratio_clipped = torch.clamp( + ratio, 1.0 - self.clip_param, 1.0 + self.clip_param + ) + surrogate = -learning_signals * ratio + surrogate_clipped = -learning_signals * ratio_clipped + loss_onpol = torch.max(surrogate, surrogate_clipped).mean() + + # * Off-policy loss + if self.beta == "on_policy": + loss_offpol = self.compute_loss_offpol(data_onpol) + elif self.beta == "off_policy": + loss_offpol = self.compute_loss_offpol(batch_offpol) + else: + raise ValueError(f"Invalid beta value: {self.beta}") + + if self.use_cv: + b = 1 + else: + b = self.inter_nu + + loss = loss_onpol + b * loss_offpol + + # * Gradient step + self.optimizer.zero_grad() + loss.backward() + nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) + self.optimizer.step() + self.mean_surrogate_loss += loss_onpol.item() + self.mean_offpol_loss += loss_offpol.item() + counter += 1 + self.mean_surrogate_loss /= counter + self.mean_offpol_loss /= counter + + def compute_loss_offpol(self, data): + obs = data["actor_obs"] + actions = self.actor.act_inference(obs) + q_input = torch.cat((data["critic_obs"], actions), dim=-1) + q_value = self.critic_q.evaluate(q_input) + return -q_value.mean() diff --git a/learning/runners/__init__.py b/learning/runners/__init__.py index b0f49217..a2cc7576 100644 --- a/learning/runners/__init__.py +++ b/learning/runners/__init__.py @@ -32,4 +32,5 @@ from .on_policy_runner import OnPolicyRunner from .my_runner import MyRunner -from .old_policy_runner import OldPolicyRunner \ No newline at end of file +from .old_policy_runner import OldPolicyRunner +from .ipg_runner import IPGRunner \ No newline 
at end of file diff --git a/learning/runners/ipg_runner.py b/learning/runners/ipg_runner.py new file mode 100644 index 00000000..ecc8a39b --- /dev/null +++ b/learning/runners/ipg_runner.py @@ -0,0 +1,274 @@ +import os +import torch +import torch.nn as nn +from tensordict import TensorDict + +from learning.utils import Logger + +from .BaseRunner import BaseRunner +from learning.modules import Actor, SmoothActor, Critic +from learning.storage import DictStorage, ReplayBuffer +from learning.algorithms import PPO_IPG + +logger = Logger() +storage_onpol = DictStorage() +storage_offpol = ReplayBuffer() + + +class IPGRunner(BaseRunner): + def __init__(self, env, train_cfg, device="cpu"): + super().__init__(env, train_cfg, device) + logger.initialize( + self.env.num_envs, + self.env.dt, + self.cfg["max_iterations"], + self.device, + ) + + def _set_up_alg(self): + alg_class = eval(self.cfg["algorithm_class_name"]) + if alg_class != PPO_IPG: + raise ValueError("IPGRunner only supports PPO_IPG") + + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + if self.actor_cfg["smooth_exploration"]: + actor = SmoothActor(num_actor_obs, num_actions, **self.actor_cfg) + else: + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + critic_v = Critic(num_critic_obs, **self.critic_cfg) + critic_q = Critic(num_critic_obs + num_actions, **self.critic_cfg) + target_critic_q = Critic(num_critic_obs + num_actions, **self.critic_cfg) + self.alg = alg_class( + actor, + critic_v, + critic_q, + target_critic_q, + device=self.device, + **self.alg_cfg, + ) + + def learn(self): + self.set_up_logger() + + rewards_dict = {} + + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + tot_iter = self.it + self.num_learning_iterations + self.save() + + # * Initialize smooth exploration matrices + if self.actor_cfg["smooth_exploration"]: + self.alg.actor.sample_weights(batch_size=self.env.num_envs) + + # * start up both on- and off-policy storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "next_actor_obs": actor_obs, + "actions": self.alg.act(actor_obs), + "critic_obs": critic_obs, + "next_critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "dones": self.get_timed_out(), + } + ) + storage_onpol.initialize( + transition, + self.env.num_envs, + self.env.num_envs * self.num_steps_per_env, + device=self.device, + ) + storage_offpol.initialize( + transition, + self.env.num_envs, + self.alg_cfg["storage_size"], + device=self.device, + ) + + # burn in observation normalization. 
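# (A brief gloss, assuming the usual running-mean/std setup: the burn-in call
#  below simply steps the environment with the current policy under
#  torch.no_grad, storing nothing, so that the obs_rms running statistics of
#  the observation normalizers see on-distribution data before the first
#  gradient update.)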
+ if self.actor_cfg["normalize_obs"] or self.critic_cfg["normalize_obs"]: + self.burn_in_normalization() + + logger.tic("runtime") + for self.it in range(self.it + 1, tot_iter + 1): + logger.tic("iteration") + logger.tic("collection") + # * Rollout + with torch.inference_mode(): + for i in range(self.num_steps_per_env): + # * Re-sample noise matrix for smooth exploration + sample_freq = self.actor_cfg["exploration_sample_freq"] + if self.actor_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor.sample_weights(batch_size=self.env.num_envs) + + actions = self.alg.act(actor_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + + # * get time_outs + timed_out = self.get_timed_out() + terminated = self.get_terminated() + dones = timed_out | terminated + + self.update_rewards(rewards_dict, terminated) + total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + + transition.update( + { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, + "rewards": total_rewards, + "timed_out": timed_out, + "dones": dones, + } + ) + # add transition to both storages + storage_onpol.add_transitions(transition) + storage_offpol.add_transitions(transition) + + logger.log_rewards(rewards_dict) + logger.log_rewards({"total_rewards": total_rewards}) + logger.finish_step(dones) + logger.toc("collection") + + logger.tic("learning") + self.alg.update(storage_onpol.data, storage_offpol.get_data()) + storage_onpol.clear() # only clear on-policy storage + logger.toc("learning") + logger.log_all_categories() + + logger.finish_iteration() + logger.toc("iteration") + logger.toc("runtime") + logger.print_to_terminal() + + if self.it % self.save_interval == 0: + self.save() + self.save() + + @torch.no_grad + def burn_in_normalization(self, n_iterations=200): + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + for _ in range(n_iterations): + actions = self.alg.act(actor_obs) + self.set_actions(self.actor_cfg["actions"], actions) + self.env.step() + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + self.alg.critic_v.evaluate(critic_obs) + q_input = torch.cat((critic_obs, actions), dim=-1) + self.alg.critic_q.evaluate(q_input) + self.alg.target_critic_q.evaluate(q_input) + # self.env.reset() + + def update_rewards(self, rewards_dict, terminated): + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], mask=terminated + ) + ) + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~terminated, + ) + ) + + def set_up_logger(self): + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) + logger.register_rewards( + list(self.critic_cfg["reward"]["termination_weight"].keys()) + ) + logger.register_rewards(["total_rewards"]) + logger.register_category( + "algorithm", + self.alg, + [ + "mean_value_loss", + "mean_surrogate_loss", + "learning_rate", + # IPG specific + "mean_q_loss", + "mean_offpol_loss", + ], + ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + + 
logger.attach_torch_obj_to_wandb( + (self.alg.actor, self.alg.critic_v, self.alg.critic_q) + ) + + def save(self): + os.makedirs(self.log_dir, exist_ok=True) + path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) + torch.save( + { + "actor_state_dict": self.alg.actor.state_dict(), + "critic_v_state_dict": self.alg.critic_v.state_dict(), + "critic_q_state_dict": self.alg.critic_q.state_dict(), + "optimizer_state_dict": self.alg.optimizer.state_dict(), + "critic_v_opt_state_dict": self.alg.critic_v_optimizer.state_dict(), + "critic_q_opt_state_dict": self.alg.critic_q_optimizer.state_dict(), + "iter": self.it, + }, + path, + ) + + def load(self, path, load_optimizer=True, load_actor_std=True): + loaded_dict = torch.load(path) + if load_actor_std: + self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) + else: + std_init = self.alg.actor.std.detach().clone() + self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) + self.alg.actor.std = nn.Parameter(std_init) + self.alg.critic_v.load_state_dict(loaded_dict["critic_v_state_dict"]) + self.alg.critic_q.load_state_dict(loaded_dict["critic_q_state_dict"]) + if load_optimizer: + self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) + self.alg.critic_v_optimizer.load_state_dict( + loaded_dict["critic_v_opt_state_dict"] + ) + self.alg.critic_q_optimizer.load_state_dict( + loaded_dict["critic_q_opt_state_dict"] + ) + self.it = loaded_dict["iter"] + + def switch_to_eval(self): + self.alg.actor.eval() + self.alg.critic_v.eval() + self.alg.critic_q.eval() + + def get_inference_actions(self): + obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) + return self.alg.actor.act_inference(obs) + + def export(self, path): + self.alg.actor.export(path) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index dded2739..e5f000f8 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -41,7 +41,7 @@ def learn(self, states_to_log_dict=None): transition.update( { "actor_obs": actor_obs, - "actions": self.alg.act(actor_obs, critic_obs), + "actions": self.alg.act(actor_obs), "critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], "dones": self.get_timed_out(), @@ -55,30 +55,13 @@ def learn(self, states_to_log_dict=None): ) # burn in observation normalization. 
- with torch.inference_mode(): - for _ in range(200): - actions = self.alg.act(actor_obs, critic_obs) - self.set_actions( - self.actor_cfg["actions"], - actions, - self.actor_cfg["disable_actions"], - ) - self.env.step() - actor_obs = self.get_noisy_obs( - self.actor_cfg["obs"], self.actor_cfg["noise"] - ) + if self.actor_cfg["normalize_obs"] or self.critic_cfg["normalize_obs"]: + self.burn_in_normalization() logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") logger.tic("collection") - - # * Simulate environment and log states - if states_to_log_dict is not None: - it_idx = self.it - 1 - if it_idx % 10 == 0: - self.sim_and_log_states(states_to_log_dict, it_idx) - # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): @@ -87,7 +70,7 @@ def learn(self, states_to_log_dict=None): if self.actor_cfg["smooth_exploration"] and i % sample_freq == 0: self.alg.actor.sample_weights(batch_size=self.env.num_envs) - actions = self.alg.act(actor_obs, critic_obs) + actions = self.alg.act(actor_obs) self.set_actions( self.actor_cfg["actions"], actions, @@ -146,6 +129,22 @@ def learn(self, states_to_log_dict=None): self.save() self.save() + @torch.no_grad + def burn_in_normalization(self, n_iterations=200): + actor_obs = self.get_obs(self.actor_cfg["obs"]) + # critic_obs = self.get_obs(self.critic_cfg["obs"]) + for _ in range(n_iterations): + actions = self.alg.act(actor_obs) + self.set_actions(self.actor_cfg["actions"], actions) + self.env.step() + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + # TODO: check this, trains better without evaluating critic + # critic_obs = self.get_obs(self.critic_cfg["obs"]) + # self.alg.critic.evaluate(critic_obs) + # self.env.reset() + def update_rewards(self, rewards_dict, terminated): rewards_dict.update( self.get_rewards( @@ -171,9 +170,7 @@ def set_up_logger(self): self.alg, ["mean_value_loss", "mean_surrogate_loss", "learning_rate"], ) - logger.register_category( - "actor", self.alg.actor, ["action_std", "entropy", "get_std"] - ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) @@ -212,42 +209,3 @@ def get_inference_actions(self): def export(self, path): self.alg.actor.export(path) - - def sim_and_log_states(self, states_to_log_dict, it_idx): - # Simulate environment for as many steps as expected in the dict. - # Log states to the dict, as well as whether the env terminated. 
- steps = states_to_log_dict["terminated"].shape[2] - actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) - - with torch.inference_mode(): - for i in range(steps): - sample_freq = self.policy_cfg["exploration_sample_freq"] - if self.policy_cfg["smooth_exploration"] and i % sample_freq == 0: - self.alg.actor_critic.actor.sample_weights( - batch_size=self.env.num_envs - ) - - actions = self.alg.act(actor_obs, critic_obs) - self.set_actions( - self.policy_cfg["actions"], - actions, - self.policy_cfg["disable_actions"], - ) - - self.env.step() - - actor_obs = self.get_noisy_obs( - self.policy_cfg["actor_obs"], self.policy_cfg["noise"] - ) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) - - # Log states (just for the first env) - terminated = self.get_terminated()[0] - for state in states_to_log_dict: - if state == "terminated": - states_to_log_dict[state][0, it_idx, i, :] = terminated - else: - states_to_log_dict[state][0, it_idx, i, :] = getattr( - self.env, state - )[0, :] diff --git a/learning/storage/__init__.py b/learning/storage/__init__.py index f10df85f..47f2593b 100644 --- a/learning/storage/__init__.py +++ b/learning/storage/__init__.py @@ -3,4 +3,5 @@ from .rollout_storage import RolloutStorage from .SE_storage import SERolloutStorage -from .dict_storage import DictStorage \ No newline at end of file +from .dict_storage import DictStorage +from .replay_buffer import ReplayBuffer \ No newline at end of file diff --git a/learning/storage/replay_buffer.py b/learning/storage/replay_buffer.py new file mode 100644 index 00000000..350a2c25 --- /dev/null +++ b/learning/storage/replay_buffer.py @@ -0,0 +1,54 @@ +import torch +from tensordict import TensorDict + + +class ReplayBuffer: + def __init__(self): + self.initialized = False + + def initialize( + self, + dummy_dict, + num_envs=2**12, + max_storage=2**17, + device="cpu", + ): + self.device = device + self.num_envs = num_envs + max_length = max_storage // num_envs + self.max_length = max_length + self.data = TensorDict({}, batch_size=(max_length, num_envs), device=device) + self.fill_count = 0 + self.add_index = 0 + + for key in dummy_dict.keys(): + if dummy_dict[key].dim() == 1: # if scalar + self.data[key] = torch.zeros( + (max_length, num_envs), + dtype=dummy_dict[key].dtype, + device=self.device, + ) + else: + self.data[key] = torch.zeros( + (max_length, num_envs, dummy_dict[key].shape[1]), + dtype=dummy_dict[key].dtype, + device=self.device, + ) + + @torch.inference_mode + def add_transitions(self, transition: TensorDict): + if self.fill_count >= self.max_length and self.add_index >= self.max_length: + self.add_index = 0 + self.data[self.add_index] = transition + self.fill_count += 1 + self.add_index += 1 + + def get_data(self): + return self.data[: min(self.fill_count, self.max_length), :] + + def clear(self): + self.fill_count = 0 + self.add_index = 0 + with torch.inference_mode(): + for tensor in self.data: + tensor.zero_() diff --git a/learning/utils/__init__.py b/learning/utils/__init__.py index 15c674d6..ec798653 100644 --- a/learning/utils/__init__.py +++ b/learning/utils/__init__.py @@ -1,8 +1,5 @@ -from .utils import ( - remove_zero_weighted_rewards, - set_discount_from_horizon, -) +from .utils import * from .dict_utils import * from .logger import Logger from .PBRS.PotentialBasedRewardShaping import PotentialBasedRewardShaping \ No newline at end of file diff --git a/learning/utils/utils.py b/learning/utils/utils.py index 7a54c65a..d4eac980 100644 
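# A minimal usage sketch of the ReplayBuffer added above (sizes are made up for
# illustration). It behaves as a ring buffer over the time dimension: once
# max_length = max_storage // num_envs rows are filled, add_transitions() wraps
# around and overwrites the oldest row, and get_data() never returns more than
# max_length rows.
import torch
from tensordict import TensorDict
from learning.storage import ReplayBuffer

num_envs, max_storage = 2, 8  # -> max_length = 4
dummy = TensorDict(
    {
        "rewards": torch.zeros(num_envs),
        "dones": torch.zeros(num_envs, dtype=torch.bool),
    },
    batch_size=num_envs,
)
buf = ReplayBuffer()
buf.initialize(dummy, num_envs=num_envs, max_storage=max_storage)
for _ in range(6):  # 6 > max_length, so rows 0 and 1 get overwritten
    buf.add_transitions(dummy)
assert buf.get_data().shape == (4, num_envs)  # capped at max_length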
--- a/learning/utils/utils.py +++ b/learning/utils/utils.py @@ -17,3 +17,9 @@ def set_discount_from_horizon(dt, horizon): discount_factor = 1 - 1 / discrete_time_horizon return discount_factor + + +def polyak_update(online, target, polyak_factor): + for op, tp in zip(online.parameters(), target.parameters()): + tp.data.copy_((1.0 - polyak_factor) * op.data + polyak_factor * tp.data) + return target From d0106332e495ce1e503165d6301f01a3343328bb Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Mon, 8 Jul 2024 15:45:00 -0400 Subject: [PATCH 22/42] IPG-smooth learns decently (adapt LR) --- gym/envs/mini_cheetah/mini_cheetah_ref_config.py | 5 ++--- learning/algorithms/ppo2.py | 3 +-- learning/algorithms/ppo_ipg.py | 13 +++++++++++-- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index eba8c34a..e4ceada5 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -73,7 +73,7 @@ class actor: hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - smooth_exploration = False + smooth_exploration = True exploration_sample_freq = 16 normalize_obs = True @@ -151,7 +151,6 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): num_learning_epochs = 6 # mini batch size = num_envs*nsteps/nminibatches num_mini_batches = 4 - mini_batch_size = 2**15 # new desired_kl = 0.02 # 0.02 for smooth-expl, else 0.01 max_grad_norm = 1.0 @@ -170,6 +169,6 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" - max_iterations = 500 # number of policy updates + max_iterations = 700 # number of policy updates algorithm_class_name = "PPO_IPG" num_steps_per_env = 32 diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index e5912e3f..7f4bb041 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -25,10 +25,9 @@ def __init__( use_clipped_value_loss=True, schedule="fixed", desired_kl=0.01, - loss_fn="MSE", - device="cpu", lr_range=[1e-4, 1e-2], lr_ratio=1.3, + device="cpu", **kwargs, ): self.device = device diff --git a/learning/algorithms/ppo_ipg.py b/learning/algorithms/ppo_ipg.py index ba92b575..cd73215d 100644 --- a/learning/algorithms/ppo_ipg.py +++ b/learning/algorithms/ppo_ipg.py @@ -28,6 +28,8 @@ def __init__( use_clipped_value_loss=True, schedule="fixed", desired_kl=0.01, + lr_range=[1e-4, 1e-2], + lr_ratio=1.3, polyak=0.995, use_cv=False, inter_nu=0.2, @@ -40,6 +42,8 @@ def __init__( self.desired_kl = desired_kl self.schedule = schedule self.learning_rate = learning_rate + self.lr_range = lr_range + self.lr_ratio = lr_ratio # * PPO components self.actor = actor.to(self.device) @@ -208,11 +212,16 @@ def update_actor(self, data_onpol, data_offpol): axis=-1, ) kl_mean = torch.mean(kl) + lr_min, lr_max = self.lr_range if kl_mean > self.desired_kl * 2.0: - self.learning_rate = max(1e-5, self.learning_rate / 1.5) + self.learning_rate = max( + lr_min, self.learning_rate / self.lr_ratio + ) elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: - self.learning_rate = min(1e-2, self.learning_rate * 1.5) + self.learning_rate = min( + lr_max, self.learning_rate * self.lr_ratio + ) for param_group in self.optimizer.param_groups: # ! 
check this From 35a65c2807b3e71d9d0c03c9c017f82b46f27fc6 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 10 Jul 2024 16:24:44 -0400 Subject: [PATCH 23/42] refactor for changes on dev --- gym/envs/base/legged_robot_config.py | 27 +++++--- gym/envs/mini_cheetah/mini_cheetah_config.py | 18 +---- .../mini_cheetah/mini_cheetah_ref_config.py | 20 +----- learning/algorithms/ppo2.py | 64 +++++++----------- learning/algorithms/ppo_ipg.py | 65 +++++++++---------- learning/modules/critic.py | 15 +++-- learning/runners/ipg_runner.py | 14 ++-- learning/runners/on_policy_runner.py | 60 +++++++++++++++-- learning/utils/dict_utils.py | 50 +++++++------- scripts/export_policy.py | 1 + 10 files changed, 177 insertions(+), 157 deletions(-) diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 1f6462ce..2e73d924 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -289,20 +289,27 @@ class termination_weight: termination = 0.01 class algorithm: - # * training params - value_loss_coef = 1.0 - use_clipped_value_loss = True + # both + gamma = 0.99 + lam = 0.95 + # shared + batch_size = 2**15 + max_gradient_steps = 24 + # new + storage_size = 2**17 # new + batch_size = 2**15 # new + clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 5 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor + entropy_coef = 0.01 schedule = "adaptive" # could be adaptive, fixed - gamma = 0.99 - lam = 0.95 desired_kl = 0.01 - max_grad_norm = 1.0 + lr_range = [2e-4, 1e-2] + lr_ratio = 1.3 class runner: policy_class_name = "ActorCritic" diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index 2bd7958d..6de32218 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -192,23 +192,7 @@ class termination_weight: termination = 0.01 class algorithm(LeggedRobotRunnerCfg.algorithm): - # * training params - value_loss_coef = 1.0 - use_clipped_value_loss = True - clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 6 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 - discount_horizon = 1.0 # [s] - # GAE_bootstrap_horizon = 2.0 # [s] - desired_kl = 0.02 - max_grad_norm = 1.0 - # * Learning rate - learning_rate = 0.002 - schedule = "adaptive" # can be adaptive or fixed - lr_range = [2e-4, 1e-2] - lr_ratio = 1.3 + pass class runner(LeggedRobotRunnerCfg.runner): run_name = "" diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index e4ceada5..30e614b0 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -141,30 +141,14 @@ class termination_weight: termination = 0.15 class algorithm(MiniCheetahRunnerCfg.algorithm): - # training params - gamma = 0.99 - lam = 0.95 - value_loss_coef = 1.0 # deprecate for PPO2 - use_clipped_value_loss = True # deprecate for PPO2 - clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 6 - # mini batch size = num_envs*nsteps/nminibatches - num_mini_batches = 4 - - desired_kl = 0.02 # 0.02 for smooth-expl, else 0.01 - max_grad_norm = 1.0 - learning_rate = 0.002 # initial value - schedule = "adaptive" # can be adaptive or fixed - lr_range = [3e-4, 1e-2] - lr_ratio = 1.3 # 1.3 for smooth-expl, else 1.5 + desired_kl = 0.02 # 
0.02 for smooth-exploration, else 0.01 # IPG - storage_size = 4 * 32 * 4096 # old_policies*nsteps*num_envs polyak = 0.995 use_cv = False inter_nu = 0.2 beta = "off_policy" + storage_size = 4 * 32 * 4096 # num_policies*num_stpes*num_envs class runner(MiniCheetahRunnerCfg.runner): run_name = "" diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 7f4bb041..6c64e334 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -5,8 +5,8 @@ from learning.utils import ( create_uniform_generator, compute_generalized_advantages, + normalize, ) -from learning.modules import SmoothActor class PPO2: @@ -14,8 +14,8 @@ def __init__( self, actor, critic, - num_learning_epochs=1, - num_mini_batches=1, + batch_size=2**15, + max_gradient_steps=10, clip_param=0.2, gamma=0.998, lam=0.95, @@ -25,9 +25,9 @@ def __init__( use_clipped_value_loss=True, schedule="fixed", desired_kl=0.01, + device="cpu", lr_range=[1e-4, 1e-2], lr_ratio=1.3, - device="cpu", **kwargs, ): self.device = device @@ -46,18 +46,14 @@ def __init__( # * PPO parameters self.clip_param = clip_param - self.num_learning_epochs = num_learning_epochs - self.num_mini_batches = num_mini_batches + self.batch_size = batch_size + self.max_gradient_steps = max_gradient_steps self.entropy_coef = entropy_coef self.gamma = gamma self.lam = lam self.max_grad_norm = max_grad_norm self.use_clipped_value_loss = use_clipped_value_loss - def test_mode(self): - self.actor.test() - self.critic.test() - def switch_to_train(self): self.actor.train() self.critic.train() @@ -65,30 +61,27 @@ def switch_to_train(self): def act(self, obs): return self.actor.act(obs).detach() - def update(self, data, last_obs=None): - if last_obs is None: - last_values = None - else: - with torch.no_grad(): - last_values = self.critic.evaluate(last_obs).detach() - compute_generalized_advantages( - data, self.gamma, self.lam, self.critic, last_values + def update(self, data): + data["values"] = self.critic.evaluate(data["critic_obs"]) + data["advantages"] = compute_generalized_advantages( + data, self.gamma, self.lam, self.critic ) - + data["returns"] = data["advantages"] + data["values"] self.update_critic(data) + data["advantages"] = normalize(data["advantages"]) self.update_actor(data) def update_critic(self, data): self.mean_value_loss = 0 counter = 0 - n, m = data.shape - total_data = n * m - batch_size = total_data // self.num_mini_batches - generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) for batch in generator: - value_batch = self.critic.evaluate(batch["critic_obs"]) - value_loss = self.critic.loss_fn(value_batch, batch["returns"]) + value_loss = self.critic.loss_fn(batch["critic_obs"], batch["returns"]) self.critic_optimizer.zero_grad() value_loss.backward() nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm) @@ -98,8 +91,6 @@ def update_critic(self, data): self.mean_value_loss /= counter def update_actor(self, data): - # already done before - # compute_generalized_advantages(data, self.gamma, self.lam, self.critic) self.mean_surrogate_loss = 0 counter = 0 @@ -110,16 +101,12 @@ def update_actor(self, data): data["actions"] ).detach() - n, m = data.shape - total_data = n * m - batch_size = total_data // self.num_mini_batches - generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + generator = create_uniform_generator( + data, + self.batch_size, + 
max_gradient_steps=self.max_gradient_steps, + ) for batch in generator: - # * Re-sample noise for smooth actor - if isinstance(self.actor, SmoothActor): - self.actor.sample_weights(batch_size) - - # ! refactor how this is done self.actor.update_distribution(batch["actor_obs"]) actions_log_prob_batch = self.actor.get_actions_log_prob(batch["actions"]) mu_batch = self.actor.action_mean @@ -171,10 +158,7 @@ def update_actor(self, data): # * Gradient step self.optimizer.zero_grad() loss.backward() - nn.utils.clip_grad_norm_( - list(self.actor.parameters()) + list(self.critic.parameters()), - self.max_grad_norm, - ) + nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) self.optimizer.step() self.mean_surrogate_loss += surrogate_loss.item() counter += 1 diff --git a/learning/algorithms/ppo_ipg.py b/learning/algorithms/ppo_ipg.py index cd73215d..000badec 100644 --- a/learning/algorithms/ppo_ipg.py +++ b/learning/algorithms/ppo_ipg.py @@ -5,9 +5,9 @@ from learning.utils import ( create_uniform_generator, compute_generalized_advantages, + normalize, polyak_update, ) -from learning.modules import SmoothActor class PPO_IPG: @@ -17,8 +17,8 @@ def __init__( critic_v, critic_q, target_critic_q, - num_learning_epochs=1, - num_mini_batches=1, + batch_size=2**15, + max_gradient_steps=10, clip_param=0.2, gamma=0.998, lam=0.95, @@ -28,13 +28,13 @@ def __init__( use_clipped_value_loss=True, schedule="fixed", desired_kl=0.01, - lr_range=[1e-4, 1e-2], - lr_ratio=1.3, polyak=0.995, use_cv=False, inter_nu=0.2, beta="off_policy", device="cpu", + lr_range=[1e-4, 1e-2], + lr_ratio=1.3, **kwargs, ): self.device = device @@ -63,8 +63,8 @@ def __init__( # * PPO parameters self.clip_param = clip_param - self.num_learning_epochs = num_learning_epochs - self.num_mini_batches = num_mini_batches + self.batch_size = batch_size + self.max_gradient_steps = max_gradient_steps self.entropy_coef = entropy_coef self.gamma = gamma self.lam = lam @@ -87,14 +87,16 @@ def act(self, obs): def update(self, data_onpol, data_offpol, last_obs=None): # On-policy GAE - if last_obs is None: - last_values = None - else: - with torch.no_grad(): - last_values = self.critic_v.evaluate(last_obs).detach() - compute_generalized_advantages( - data_onpol, self.gamma, self.lam, self.critic_v, last_values + values = self.critic_v.evaluate(data_onpol["critic_obs"]) + # Handle single env case + if values.dim() == 1: + values = values.unsqueeze(-1) + data_onpol["values"] = values + data_onpol["advantages"] = compute_generalized_advantages( + data_onpol, self.gamma, self.lam, self.critic_v ) + data_onpol["returns"] = data_onpol["advantages"] + data_onpol["values"] + data_onpol["advantages"] = normalize(data_onpol["advantages"]) self.update_critic_q(data_offpol) @@ -116,10 +118,11 @@ def update_critic_q(self, data): self.mean_q_loss = 0 counter = 0 - n, m = data.shape - total_data = n * m - batch_size = total_data // self.num_mini_batches - generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) for batch in generator: with torch.no_grad(): # TODO: check that should be inference @@ -133,8 +136,7 @@ def update_critic_q(self, data): + self.gamma * batch["dones"].logical_not() * q_next ) q_input = torch.cat((batch["critic_obs"], batch["actions"]), dim=-1) - q_value = self.critic_q.evaluate(q_input) - q_loss = self.critic_q.loss_fn(q_value, q_target) + q_loss = self.critic_q.loss_fn(q_input, q_target) 
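# The q_target computed above is the standard one-step bootstrap, using the
# target Q-network and the next action from the current actor (act_inference):
#     q_target = r_t + gamma * (1 - done_t) * Q_target(s_{t+1}, a_{t+1})
# e.g. with assumed values r_t = 1.0, gamma = 0.99, q_next = 10.0, done_t = False
# this gives q_target = 1.0 + 0.99 * 10.0 = 10.9; a terminal step keeps only r_t.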
self.critic_q_optimizer.zero_grad() q_loss.backward() nn.utils.clip_grad_norm_(self.critic_q.parameters(), self.max_grad_norm) @@ -152,13 +154,13 @@ def update_critic_v(self, data): self.mean_value_loss = 0 counter = 0 - n, m = data.shape - total_data = n * m - batch_size = total_data // self.num_mini_batches - generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) for batch in generator: - value_batch = self.critic_v.evaluate(batch["critic_obs"]) - value_loss = self.critic_v.loss_fn(value_batch, batch["returns"]) + value_loss = self.critic_v.loss_fn(batch["critic_obs"], batch["returns"]) self.critic_v_optimizer.zero_grad() value_loss.backward() nn.utils.clip_grad_norm_(self.critic_v.parameters(), self.max_grad_norm) @@ -180,17 +182,12 @@ def update_actor(self, data_onpol, data_offpol): ).detach() # Generate off-policy batches and use all on-policy data - n, m = data_offpol.shape - total_data = n * m - batch_size = total_data // self.num_mini_batches generator_offpol = create_uniform_generator( - data_offpol, batch_size, self.num_learning_epochs + data_offpol, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, ) for batch_offpol in generator_offpol: - # * Re-sample noise for smooth actor - if isinstance(self.actor, SmoothActor): - self.actor.sample_weights(batch_size) - self.actor.update_distribution(data_onpol["actor_obs"]) actions_log_prob_onpol = self.actor.get_actions_log_prob( data_onpol["actions"] diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 3ee930dc..96c53438 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -15,16 +15,19 @@ def __init__( ): super().__init__() - self.NN = create_MLP(num_obs, 1, hidden_dims, activation, latent=False) + self.NN = create_MLP(num_obs, 1, hidden_dims, activation) self._normalize_obs = normalize_obs if self._normalize_obs: self.obs_rms = RunningMeanStd(num_obs) - def evaluate(self, critic_observations): + def forward(self, x): if self._normalize_obs: with torch.no_grad(): - critic_observations = self.obs_rms(critic_observations) - return self.NN(critic_observations).squeeze() + x = self.obs_rms(x) + return self.NN(x).squeeze() + + def evaluate(self, critic_observations): + return self.forward(critic_observations) - def loss_fn(self, input, target): - return nn.functional.mse_loss(input, target, reduction="mean") + def loss_fn(self, obs, target): + return nn.functional.mse_loss(self.forward(obs), target, reduction="mean") diff --git a/learning/runners/ipg_runner.py b/learning/runners/ipg_runner.py index ecc8a39b..bb8bb044 100644 --- a/learning/runners/ipg_runner.py +++ b/learning/runners/ipg_runner.py @@ -6,9 +6,9 @@ from learning.utils import Logger from .BaseRunner import BaseRunner +from learning.algorithms import * # noqa: F403 from learning.modules import Actor, SmoothActor, Critic from learning.storage import DictStorage, ReplayBuffer -from learning.algorithms import PPO_IPG logger = Logger() storage_onpol = DictStorage() @@ -26,10 +26,11 @@ def __init__(self, env, train_cfg, device="cpu"): ) def _set_up_alg(self): - alg_class = eval(self.cfg["algorithm_class_name"]) - if alg_class != PPO_IPG: + alg_class_name = self.cfg["algorithm_class_name"] + if alg_class_name != "PPO_IPG": raise ValueError("IPGRunner only supports PPO_IPG") + alg_class = eval(alg_class_name) num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) num_actions = 
self.get_action_size(self.actor_cfg["actions"]) num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) @@ -170,7 +171,7 @@ def learn(self): self.save() @torch.no_grad - def burn_in_normalization(self, n_iterations=200): + def burn_in_normalization(self, n_iterations=100): actor_obs = self.get_obs(self.actor_cfg["obs"]) critic_obs = self.get_obs(self.critic_cfg["obs"]) for _ in range(n_iterations): @@ -181,11 +182,12 @@ def burn_in_normalization(self, n_iterations=200): self.actor_cfg["obs"], self.actor_cfg["noise"] ) critic_obs = self.get_obs(self.critic_cfg["obs"]) - self.alg.critic_v.evaluate(critic_obs) + # TODO: Check this, seems to perform better without critic eval + # self.alg.critic_v.evaluate(critic_obs) q_input = torch.cat((critic_obs, actions), dim=-1) self.alg.critic_q.evaluate(q_input) self.alg.target_critic_q.evaluate(q_input) - # self.env.reset() + self.env.reset() def update_rewards(self, rewards_dict, terminated): rewards_dict.update( diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index e5f000f8..c98d088c 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -41,8 +41,10 @@ def learn(self, states_to_log_dict=None): transition.update( { "actor_obs": actor_obs, + "next_actor_obs": actor_obs, "actions": self.alg.act(actor_obs), "critic_obs": critic_obs, + "next_critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], "dones": self.get_timed_out(), } @@ -62,6 +64,13 @@ def learn(self, states_to_log_dict=None): for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") logger.tic("collection") + + # * Simulate environment and log states + if states_to_log_dict is not None: + it_idx = self.it - 1 + if it_idx % 10 == 0: + self.sim_and_log_states(states_to_log_dict, it_idx) + # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): @@ -102,6 +111,8 @@ def learn(self, states_to_log_dict=None): transition.update( { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, "rewards": total_rewards, "timed_out": timed_out, "dones": dones, @@ -130,7 +141,7 @@ def learn(self, states_to_log_dict=None): self.save() @torch.no_grad - def burn_in_normalization(self, n_iterations=200): + def burn_in_normalization(self, n_iterations=100): actor_obs = self.get_obs(self.actor_cfg["obs"]) # critic_obs = self.get_obs(self.critic_cfg["obs"]) for _ in range(n_iterations): @@ -140,10 +151,10 @@ def burn_in_normalization(self, n_iterations=200): actor_obs = self.get_noisy_obs( self.actor_cfg["obs"], self.actor_cfg["noise"] ) - # TODO: check this, trains better without evaluating critic + # TODO: Check this, seems to perform better without critic eval # critic_obs = self.get_obs(self.critic_cfg["obs"]) # self.alg.critic.evaluate(critic_obs) - # self.env.reset() + self.env.reset() def update_rewards(self, rewards_dict, terminated): rewards_dict.update( @@ -170,7 +181,9 @@ def set_up_logger(self): self.alg, ["mean_value_loss", "mean_surrogate_loss", "learning_rate"], ) - logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + logger.register_category( + "actor", self.alg.actor, ["action_std", "entropy", "get_std"] + ) logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) @@ -209,3 +222,42 @@ def get_inference_actions(self): def export(self, path): self.alg.actor.export(path) + + def sim_and_log_states(self, states_to_log_dict, it_idx): + # Simulate environment for as many steps as expected in the dict. 
+ # Log states to the dict, as well as whether the env terminated. + steps = states_to_log_dict["terminated"].shape[2] + actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) + critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + + with torch.inference_mode(): + for i in range(steps): + sample_freq = self.policy_cfg["exploration_sample_freq"] + if self.policy_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor_critic.actor.sample_weights( + batch_size=self.env.num_envs + ) + + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.policy_cfg["actions"], + actions, + self.policy_cfg["disable_actions"], + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + ) + critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + + # Log states (just for the first env) + terminated = self.get_terminated()[0] + for state in states_to_log_dict: + if state == "terminated": + states_to_log_dict[state][0, it_idx, i, :] = terminated + else: + states_to_log_dict[state][0, it_idx, i, :] = getattr( + self.env, state + )[0, :] diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index 2d19e072..99534c22 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -9,30 +9,29 @@ def compute_MC_returns(data: TensorDict, gamma, critic=None): else: last_values = critic.evaluate(data["critic_obs"][-1]) - data.update({"returns": torch.zeros_like(data["rewards"])}) - data["returns"][-1] = last_values * ~data["dones"][-1] + returns = torch.zeros_like(data["rewards"]) + returns[-1] = last_values * ~data["dones"][-1] for k in reversed(range(data["rewards"].shape[0] - 1)): not_done = ~data["dones"][k] - data["returns"][k] = ( - data["rewards"][k] + gamma * data["returns"][k + 1] * not_done - ) - data["returns"] = (data["returns"] - data["returns"].mean()) / ( - data["returns"].std() + 1e-8 - ) - return + returns[k] = data["rewards"][k] + gamma * returns[k + 1] * not_done + + return normalize(returns) @torch.no_grad -def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): - data.update({"values": critic.evaluate(data["critic_obs"])}) +def normalize(input, eps=1e-8): + return (input - input.mean()) / (input.std() + eps) - data.update({"advantages": torch.zeros_like(data["values"])}) +@torch.no_grad +def compute_generalized_advantages(data, gamma, lam, critic): + last_values = critic.evaluate(data["next_critic_obs"][-1]) + advantages = torch.zeros_like(data["values"]) if last_values is not None: # todo check this # since we don't have observations for the last step, need last value plugged in not_done = ~data["dones"][-1] - data["advantages"][-1] = ( + advantages[-1] = ( data["rewards"][-1] + gamma * data["values"][-1] * data["timed_out"][-1] + gamma * last_values * not_done @@ -47,23 +46,30 @@ def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): + gamma * data["values"][k + 1] * not_done - data["values"][k] ) - data["advantages"][k] = ( - td_error + gamma * lam * not_done * data["advantages"][k + 1] - ) + advantages[k] = td_error + gamma * lam * not_done * advantages[k + 1] - data["returns"] = data["advantages"] + data["values"] - - data["advantages"] = (data["advantages"] - data["advantages"].mean()) / ( - data["advantages"].std() + 1e-8 - ) + return advantages # todo change num_epochs to num_batches @torch.no_grad -def create_uniform_generator(data, batch_size, num_epochs, keys=None): +def create_uniform_generator( + data, 
batch_size, num_epochs=1, max_gradient_steps=None, keys=None +): n, m = data.shape total_data = n * m + + if batch_size > total_data: + Warning("Batch size is larger than total data, using available data only.") + batch_size = total_data + num_batches_per_epoch = total_data // batch_size + if max_gradient_steps: + if max_gradient_steps < num_batches_per_epoch: + num_batches_per_epoch = max_gradient_steps + num_epochs = max_gradient_steps // num_batches_per_epoch + num_epochs = max(num_epochs, 1) + for epoch in range(num_epochs): indices = torch.randperm(total_data, device=data.device) for i in range(num_batches_per_epoch): diff --git a/scripts/export_policy.py b/scripts/export_policy.py index a5897359..d591ea14 100644 --- a/scripts/export_policy.py +++ b/scripts/export_policy.py @@ -22,6 +22,7 @@ def setup_and_export(args): LEGGED_GYM_ROOT_DIR, "logs", train_cfg.runner.experiment_name, + train_cfg.runner.load_run, "exported", ) runner.export(path) From 5ce0d6e4c2d44425ee28539d36e7458253b32c0c Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 10 Jul 2024 20:14:34 -0400 Subject: [PATCH 24/42] finetune using robot-software logs --- gym/envs/mini_cheetah/minimalist_cheetah.py | 71 +++++++++ learning/runners/finetune_runner.py | 73 +++++++++ scripts/finetune.py | 159 ++++++++++++++++++++ 3 files changed, 303 insertions(+) create mode 100644 gym/envs/mini_cheetah/minimalist_cheetah.py create mode 100644 learning/runners/finetune_runner.py create mode 100644 scripts/finetune.py diff --git a/gym/envs/mini_cheetah/minimalist_cheetah.py b/gym/envs/mini_cheetah/minimalist_cheetah.py new file mode 100644 index 00000000..8da8b9d9 --- /dev/null +++ b/gym/envs/mini_cheetah/minimalist_cheetah.py @@ -0,0 +1,71 @@ +import torch + + +class MinimalistCheetah: + """ + Helper class for computing mini cheetah rewards + """ + + def __init__(self, device="cpu", tracking_sigma=0.25): + self.device = device + self.tracking_sigma = tracking_sigma + + def set_states( + self, + base_height, + base_lin_vel, + base_ang_vel, + proj_gravity, + commands, + phase_obs, + grf, + ): + # Unsqueeze so first dim is batch_size + self.base_height = torch.tensor(base_height, device=self.device).unsqueeze(0) + self.base_lin_vel = torch.tensor(base_lin_vel, device=self.device).unsqueeze(0) + self.base_ang_vel = torch.tensor(base_ang_vel, device=self.device).unsqueeze(0) + self.proj_gravity = torch.tensor(proj_gravity, device=self.device).unsqueeze(0) + self.commands = torch.tensor(commands, device=self.device).unsqueeze(0) + self.grf = torch.tensor(grf, device=self.device).unsqueeze(0) + + phase_obs = torch.tensor(phase_obs, device=self.device).unsqueeze(0) + self.phase_sin = torch.sin(phase_obs) + + def _switch(self): + c_vel = torch.linalg.norm(self.commands, dim=1) + return torch.exp( + -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) / 0.1 + ) + + def _reward_tracking_lin_vel(self): + """Tracking of linear velocity commands (xy axes)""" + # just use lin_vel? + error = self.commands[:, :2] - self.base_lin_vel[:, :2] + # * scale by (1+|cmd|): if cmd=0, no scaling. 
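# e.g. with assumed values cmd = (1.0, 0.0) m/s and base_lin_vel = (0.8, 0.0):
#   raw error = (0.2, 0.0) -> scaled by 1 / (1 + |cmd|) -> (0.1, 0.0)
#   sum of squares = 0.01 -> reward = exp(-0.01 / 0.25) ~= 0.96 with sigma = 0.25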
+ error *= 1.0 / (1.0 + torch.abs(self.commands[:, :2])) + error = torch.sum(torch.square(error), dim=1) + return torch.exp(-error / self.tracking_sigma) + + def _reward_tracking_ang_vel(self): + """Tracking of angular velocity commands (yaw)""" + ang_vel_error = torch.square(self.commands[:, 2] - self.base_ang_vel[:, 2]) + return torch.exp(-ang_vel_error / self.tracking_sigma) + + def _reward_orientation(self): + """Penalize non-flat base orientation""" + error = torch.square(self.proj_gravity[:, :2]) / self.tracking_sigma + return torch.sum(torch.exp(-error), dim=1) + + def _reward_swing_grf(self): + """Reward non-zero grf during swing (0 to pi)""" + in_contact = torch.gt(torch.norm(self.grf, dim=-1), 50.0) + ph_off = torch.gt(self.phase_sin, 0) # phase <= pi + rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1) + return -torch.sum(rew.float(), dim=1) * (1 - self._switch()) + + def _reward_stance_grf(self): + """Reward non-zero grf during stance (pi to 2pi)""" + in_contact = torch.gt(torch.norm(self.grf, dim=-1), 50.0) + ph_off = torch.lt(self.phase_sin, 0) # phase >= pi + rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1) + return torch.sum(rew.float(), dim=1) * (1 - self._switch()) diff --git a/learning/runners/finetune_runner.py b/learning/runners/finetune_runner.py new file mode 100644 index 00000000..deace319 --- /dev/null +++ b/learning/runners/finetune_runner.py @@ -0,0 +1,73 @@ +import torch +from learning.algorithms import * # noqa: F403 +from learning.modules import Actor, SmoothActor, Critic +from learning.utils import remove_zero_weighted_rewards + + +class FineTuneRunner: + def __init__(self, train_cfg, data_dict, device="cpu"): + self.parse_train_cfg(train_cfg) + self.data_dict = data_dict + self.device = device + + def _set_up_alg(self): + num_actor_obs = self.data_dict["actor_obs"].shape[-1] + num_actions = self.data_dict["actions"].shape[-1] + num_critic_obs = self.data_dict["critic_obs"].shape[-1] + if self.actor_cfg["smooth_exploration"]: + actor = SmoothActor(num_actor_obs, num_actions, **self.actor_cfg) + else: + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + critic = Critic(num_critic_obs, **self.critic_cfg) + alg_class = eval(self.cfg["algorithm_class_name"]) + self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) + + def parse_train_cfg(self, train_cfg): + self.cfg = train_cfg["runner"] + self.alg_cfg = train_cfg["algorithm"] + remove_zero_weighted_rewards(train_cfg["critic"]["reward"]["weights"]) + self.actor_cfg = train_cfg["actor"] + self.critic_cfg = train_cfg["critic"] + + def learn(self): + # Single update on data dict + self.alg.update(self.data_dict) + + def save(self, path): + torch.save( + { + "actor_state_dict": self.alg.actor.state_dict(), + "critic_state_dict": self.alg.critic.state_dict(), + "optimizer_state_dict": self.alg.optimizer.state_dict(), + "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(), + "iter": 1, # only one iteration + }, + path, + ) + + def load(self, path, load_optimizer=True): + loaded_dict = torch.load(path) + self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) + self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) + if load_optimizer: + self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) + self.alg.critic_optimizer.load_state_dict( + loaded_dict["critic_optimizer_state_dict"] + ) + + def export(self, path): + # Need to make a copy of actor + if self.actor_cfg["smooth_exploration"]: + 
actor_copy = SmoothActor( + self.alg.actor.num_obs, self.alg.actor.num_actions, **self.actor_cfg + ) + else: + actor_copy = Actor( + self.alg.actor.num_obs, self.alg.actor.num_actions, **self.actor_cfg + ) + state_dict = { + name: param.detach().clone() + for name, param in self.alg.actor.state_dict().items() + } + actor_copy.load_state_dict(state_dict) + actor_copy.export(path) diff --git a/scripts/finetune.py b/scripts/finetune.py new file mode 100644 index 00000000..63c55328 --- /dev/null +++ b/scripts/finetune.py @@ -0,0 +1,159 @@ +from gym.envs import __init__ # noqa: F401 +from gym.utils import get_args, task_registry +from gym.utils.helpers import class_to_dict + +from learning.algorithms import * # noqa: F403 +from learning.runners.finetune_runner import FineTuneRunner +from gym.envs.mini_cheetah.minimalist_cheetah import MinimalistCheetah + +from gym import LEGGED_GYM_ROOT_DIR + +import os +import torch +import scipy.io +from tensordict import TensorDict + +ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" +LOAD_RUN = "Jul08_13-02-15_PPO32_S16" +LOG_FILE = "PPO32_S16_expl.mat" +ITERATION = 500 + +# Data struct fields from Robot-Software logs +DATA_LIST = [ + "header", + "base_height", # shape (1, batch_size) + # the following are all shape (batch_size, n) + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "phase_obs", + "grf", + "dof_pos_target", + "exploration_noise", + "footer", +] + +# Finetune for these rewards instead of the ones in the cfg +REWARD_WEIGHTS = { + # "tracking_lin_vel": 4.0, + "tracking_ang_vel": 2.0, + "orientation": 1.0, + "swing_grf": 1.5, + "stance_grf": 1.5, +} + +DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def get_obs(obs_list, data_struct, batch_size): + obs_all = torch.empty(0).to(DEVICE) + for obs_name in obs_list: + data_idx = DATA_LIST.index(obs_name) + obs = torch.tensor(data_struct[data_idx]).to(DEVICE) + obs = obs.reshape((batch_size, 1, -1)) # shape (batch_size, 1, n) + obs_all = torch.cat((obs_all, obs), dim=-1) + + return obs_all.float() + + +def get_rewards(data_struct, batch_size): + rewards_all = torch.empty(0).to(DEVICE) + for i in range(batch_size): + mini_cheetah = MinimalistCheetah(device=DEVICE) + mini_cheetah.set_states( + base_height=data_struct[1][:, i], # shape (1, batch_size) + base_lin_vel=data_struct[2][i], + base_ang_vel=data_struct[3][i], + proj_gravity=data_struct[4][i], + commands=data_struct[5][i], + phase_obs=data_struct[8][i], + grf=data_struct[9][i], + ) + total_rewards = 0 + for name, weight in REWARD_WEIGHTS.items(): + reward = eval(f"mini_cheetah._reward_{name}()") + total_rewards += weight * reward + rewards_all = torch.cat((rewards_all, total_rewards), dim=0) + + return rewards_all.float() + + +def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER"): + path = os.path.join(ROOT_DIR, LOAD_RUN, LOG_FILE) + data = scipy.io.loadmat(path) + data_struct = data[name][0][0] + batch_size = data_struct[1].shape[1] # base_height: shape (1, batch_size) + data_dict = TensorDict( + {}, + device=DEVICE, + batch_size=(batch_size - 1, 1), # -1 for next_obs + ) + + actor_obs = get_obs(train_cfg["actor"]["obs"], data_struct, batch_size) + critic_obs = get_obs(train_cfg["critic"]["obs"], data_struct, batch_size) + + data_dict["actor_obs"] = actor_obs[:-1] + data_dict["next_actor_obs"] = actor_obs[1:] + data_dict["critic_obs"] = critic_obs[:-1] + data_dict["next_critic_obs"] = critic_obs[1:] + + actions_idx = DATA_LIST.index("dof_pos_target") + 
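# (Gloss on the indexing convention, based on DATA_LIST above: each per-step
#  field of the Robot-Software .mat struct is stored as (batch_size, n) and is
#  looked up by position via DATA_LIST.index(name); get_obs() reshapes each one
#  to (batch_size, 1, n), where the middle dimension of 1 matches the single
#  "environment" expected by the TensorDict with batch_size=(batch_size - 1, 1).)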
actions = torch.tensor(data_struct[actions_idx]).to(DEVICE) + actions = actions.reshape((batch_size, 1, -1)) # shape (batch_size, 1, n) + data_dict["actions"] = actions[:-1] + + rewards = get_rewards(data_struct, batch_size) + rewards = rewards.reshape((batch_size, 1)) # shape (batch_size, 1) + data_dict["rewards"] = rewards[:-1] + + # Assume no termination + data_dict["timed_out"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) + data_dict["terminal"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) + data_dict["dones"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) + + return data_dict + + +def setup(): + args = get_args() + + _, train_cfg = task_registry.create_cfgs(args) + train_cfg = class_to_dict(train_cfg) + data_dict = get_data_dict(train_cfg) + + runner = FineTuneRunner(train_cfg, data_dict, device=DEVICE) + runner._set_up_alg() + + return runner + + +def finetune(runner): + load_path = os.path.join(ROOT_DIR, LOAD_RUN, "model_" + str(ITERATION) + ".pt") + print("Loading model from: ", load_path) + runner.load(load_path) + + # Perform a single update + runner.learn() + + # Compare old and new actions + scaling = torch.tensor([0.2, 0.3, 0.3], device=DEVICE).repeat(4) + actions_old = runner.data_dict["actions"] + actions_new = scaling * runner.alg.actor.act_inference( + runner.data_dict["actor_obs"] + ) + diff = actions_new - actions_old + print("Mean action diff per actuator: ", diff.mean(dim=0)) + + save_path = os.path.join(ROOT_DIR, LOAD_RUN, "model_" + str(ITERATION + 1) + ".pt") + runner.save(save_path) + + # export_path = os.path.join(ROOT_DIR, LOAD_RUN, "exported_finetuned") + # runner.export(export_path) + + +if __name__ == "__main__": + runner = setup() + finetune(runner) From cb75936cb925febed68959bd285ed8ce29970680 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 10 Jul 2024 20:16:13 -0400 Subject: [PATCH 25/42] handle single env dimension at critic --- learning/algorithms/ppo_ipg.py | 8 ++------ learning/modules/critic.py | 2 +- learning/modules/utils/neural_net.py | 8 +++++--- learning/modules/utils/normalize.py | 2 +- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/learning/algorithms/ppo_ipg.py b/learning/algorithms/ppo_ipg.py index 000badec..e377579f 100644 --- a/learning/algorithms/ppo_ipg.py +++ b/learning/algorithms/ppo_ipg.py @@ -85,13 +85,9 @@ def switch_to_train(self): def act(self, obs): return self.actor.act(obs).detach() - def update(self, data_onpol, data_offpol, last_obs=None): + def update(self, data_onpol, data_offpol): # On-policy GAE - values = self.critic_v.evaluate(data_onpol["critic_obs"]) - # Handle single env case - if values.dim() == 1: - values = values.unsqueeze(-1) - data_onpol["values"] = values + data_onpol["values"] = self.critic_v.evaluate(data_onpol["critic_obs"]) data_onpol["advantages"] = compute_generalized_advantages( data_onpol, self.gamma, self.lam, self.critic_v ) diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 96c53438..f2e8678f 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -24,7 +24,7 @@ def forward(self, x): if self._normalize_obs: with torch.no_grad(): x = self.obs_rms(x) - return self.NN(x).squeeze() + return self.NN(x).squeeze(-1) def evaluate(self, critic_observations): return self.forward(critic_observations) diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index 6e6f41d6..d452da57 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py 
@@ -1,6 +1,7 @@ import torch import os -import copy + +# import copy import numpy as np @@ -78,7 +79,8 @@ def export_network(network, network_name, path, num_inputs, latent=True): os.makedirs(path, exist_ok=True) path_TS = os.path.join(path, network_name + ".pt") # TorchScript path path_onnx = os.path.join(path, network_name + ".onnx") # ONNX path - model = copy.deepcopy(network).to("cpu") + # model = copy.deepcopy(network).to("cpu") + model = network.to("cpu") # no deepcopy # To trace model, must be evaluated once with arbitrary input model.eval() dummy_input = torch.rand((num_inputs)) @@ -98,5 +100,5 @@ def export_network(network, network_name, path, num_inputs, latent=True): # Save actor std of shape (num_actions, latent_dim) # It is important that the shape is the same as the exploration matrix path_std = os.path.join(path, network_name + "_std.txt") - std_transposed = model.get_std.numpy().T + std_transposed = model.get_std.detach().numpy().T np.savetxt(path_std, std_transposed) diff --git a/learning/modules/utils/normalize.py b/learning/modules/utils/normalize.py index 246bafa4..0a60dcbf 100644 --- a/learning/modules/utils/normalize.py +++ b/learning/modules/utils/normalize.py @@ -41,7 +41,7 @@ def _update_mean_var_from_moments( def forward(self, input): if self.training: mean = input.mean(tuple(range(input.dim() - 1))) - var = input.var(tuple(range(input.dim() - 1))) + var = torch.nan_to_num(input.var(tuple(range(input.dim() - 1)))) ( self.running_mean, self.running_var, From f5829f079c66c66bb3c4d8f362e184c16ed06311 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Thu, 11 Jul 2024 14:25:42 -0400 Subject: [PATCH 26/42] update finetuning: exploration scale, grf rewards, plotting --- .../mini_cheetah/mini_cheetah_ref_config.py | 9 ++- gym/envs/mini_cheetah/minimalist_cheetah.py | 14 ++-- learning/modules/smooth_actor.py | 22 +++--- learning/runners/finetune_runner.py | 10 ++- scripts/finetune.py | 79 +++++++++++++------ 5 files changed, 89 insertions(+), 45 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 30e614b0..09970e66 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -67,7 +67,7 @@ class scaling(MiniCheetahCfg.scaling): class MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg): seed = -1 - runner_class_name = "IPGRunner" + runner_class_name = "OnPolicyRunner" class actor: hidden_dims = [256, 256, 128] @@ -150,9 +150,14 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): beta = "off_policy" storage_size = 4 * 32 * 4096 # num_policies*num_stpes*num_envs + # Finetuning + learning_rate = 3e-4 + max_gradient_steps = 10 + batch_size = 4096 + class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" max_iterations = 700 # number of policy updates - algorithm_class_name = "PPO_IPG" + algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/gym/envs/mini_cheetah/minimalist_cheetah.py b/gym/envs/mini_cheetah/minimalist_cheetah.py index 8da8b9d9..658bb9a3 100644 --- a/gym/envs/mini_cheetah/minimalist_cheetah.py +++ b/gym/envs/mini_cheetah/minimalist_cheetah.py @@ -29,7 +29,7 @@ def set_states( self.grf = torch.tensor(grf, device=self.device).unsqueeze(0) phase_obs = torch.tensor(phase_obs, device=self.device).unsqueeze(0) - self.phase_sin = torch.sin(phase_obs) + self.phase_sin = phase_obs[:, 0].unsqueeze(0) def _switch(self): c_vel = torch.linalg.norm(self.commands, dim=1) @@ -56,16 +56,16 @@ def 
_reward_orientation(self): error = torch.square(self.proj_gravity[:, :2]) / self.tracking_sigma return torch.sum(torch.exp(-error), dim=1) - def _reward_swing_grf(self): + def _reward_swing_grf(self, contact_thresh=0.5): """Reward non-zero grf during swing (0 to pi)""" - in_contact = torch.gt(torch.norm(self.grf, dim=-1), 50.0) + in_contact = torch.gt(self.grf, contact_thresh) ph_off = torch.gt(self.phase_sin, 0) # phase <= pi rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1) - return -torch.sum(rew.float(), dim=1) * (1 - self._switch()) + return -torch.sum(rew.float(), dim=1) # * (1 - self._switch()) - def _reward_stance_grf(self): + def _reward_stance_grf(self, contact_thresh=0.5): """Reward non-zero grf during stance (pi to 2pi)""" - in_contact = torch.gt(torch.norm(self.grf, dim=-1), 50.0) + in_contact = torch.gt(self.grf, contact_thresh) ph_off = torch.lt(self.phase_sin, 0) # phase >= pi rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1) - return torch.sum(rew.float(), dim=1) * (1 - self._switch()) + return torch.sum(rew.float(), dim=1) # * (1 - self._switch()) diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py index 5cb1da89..72c4ace1 100644 --- a/learning/modules/smooth_actor.py +++ b/learning/modules/smooth_actor.py @@ -12,8 +12,9 @@ # https://github.com/DLR-RM/stable-baselines3/blob/56f20e40a2206bbb16501a0f600e29ce1b112ef1/stable_baselines3/common/distributions.py#L421C7-L421C38 class SmoothActor(Actor): weights_dist: Normal - _latent_sde: torch.Tensor + latent_sde: torch.Tensor exploration_matrices: torch.Tensor + exploration_scale: float def __init__( self, @@ -23,6 +24,7 @@ def __init__( learn_features: bool = True, epsilon: float = 1e-6, log_std_init: float = 0.0, + exploration_scale: float = 1.0, **kwargs, ): super().__init__(*args, **kwargs) @@ -31,6 +33,7 @@ def __init__( self.learn_features = learn_features self.epsilon = epsilon self.log_std_init = log_std_init + self.exploration_scale = exploration_scale # for finetuning # Create latent NN and last layer self.latent_net = create_MLP( @@ -88,16 +91,17 @@ def update_distribution(self, observations): with torch.no_grad(): observations = self.obs_rms(observations) # Get latent features and compute distribution - self._latent_sde = self.latent_net(observations) + self.latent_sde = self.latent_net(observations) + std_scaled = self.get_std * self.exploration_scale if not self.learn_features: - self._latent_sde = self._latent_sde.detach() - if self._latent_sde.dim() == 2: - variance = torch.mm(self._latent_sde**2, self.get_std**2) - elif self._latent_sde.dim() == 3: - variance = torch.einsum("abc,cd->abd", self._latent_sde**2, self.get_std**2) + self.latent_sde = self.latent_sde.detach() + if self.latent_sde.dim() == 2: + variance = torch.mm(self.latent_sde**2, std_scaled**2) + elif self.latent_sde.dim() == 3: + variance = torch.einsum("abc,cd->abd", self.latent_sde**2, std_scaled**2) else: raise ValueError("Invalid latent_sde dimension") - mean_actions = self.mean_actions_net(self._latent_sde) + mean_actions = self.mean_actions_net(self.latent_sde) self.distribution = Normal(mean_actions, torch.sqrt(variance + self.epsilon)) def act(self, observations): @@ -118,7 +122,7 @@ def act_inference(self, observations): return mean_actions def get_noise(self): - latent_sde = self._latent_sde + latent_sde = self.latent_sde if not self.learn_features: latent_sde = latent_sde.detach() # Use batch matrix multiplication for efficient computation diff --git 
a/learning/runners/finetune_runner.py b/learning/runners/finetune_runner.py index deace319..4cfdf895 100644 --- a/learning/runners/finetune_runner.py +++ b/learning/runners/finetune_runner.py @@ -5,9 +5,10 @@ class FineTuneRunner: - def __init__(self, train_cfg, data_dict, device="cpu"): + def __init__(self, train_cfg, data_dict, exploration_scale=1.0, device="cpu"): self.parse_train_cfg(train_cfg) self.data_dict = data_dict + self.exploration_scale = exploration_scale self.device = device def _set_up_alg(self): @@ -15,7 +16,12 @@ def _set_up_alg(self): num_actions = self.data_dict["actions"].shape[-1] num_critic_obs = self.data_dict["critic_obs"].shape[-1] if self.actor_cfg["smooth_exploration"]: - actor = SmoothActor(num_actor_obs, num_actions, **self.actor_cfg) + actor = SmoothActor( + num_obs=num_actor_obs, + num_actions=num_actions, + exploration_scale=self.exploration_scale, + **self.actor_cfg, + ) else: actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) critic = Critic(num_critic_obs, **self.critic_cfg) diff --git a/scripts/finetune.py b/scripts/finetune.py index 63c55328..7fc3f270 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -12,17 +12,22 @@ import torch import scipy.io from tensordict import TensorDict +import matplotlib.pyplot as plt ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" LOAD_RUN = "Jul08_13-02-15_PPO32_S16" -LOG_FILE = "PPO32_S16_expl.mat" +LOG_FILE = "PPO32_S16_Jul08_expl08.mat" + +TRAJ_LENGTH = 100 # split data into chunks of this length +EXPLORATION_SCALE = 0.8 # scale std in SmoothActor ITERATION = 500 +PLOT = True # Data struct fields from Robot-Software logs DATA_LIST = [ "header", - "base_height", # shape (1, batch_size) - # the following are all shape (batch_size, n) + "base_height", # shape (1, data_length) + # the following are all shape (data_length, n) "base_lin_vel", "base_ang_vel", "projected_gravity", @@ -39,8 +44,8 @@ # Finetune for these rewards instead of the ones in the cfg REWARD_WEIGHTS = { # "tracking_lin_vel": 4.0, - "tracking_ang_vel": 2.0, - "orientation": 1.0, + # "tracking_ang_vel": 2.0, + # "orientation": 1.0, "swing_grf": 1.5, "stance_grf": 1.5, } @@ -48,23 +53,27 @@ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") -def get_obs(obs_list, data_struct, batch_size): +def get_obs(obs_list, data_struct, batches): obs_all = torch.empty(0).to(DEVICE) for obs_name in obs_list: data_idx = DATA_LIST.index(obs_name) obs = torch.tensor(data_struct[data_idx]).to(DEVICE) - obs = obs.reshape((batch_size, 1, -1)) # shape (batch_size, 1, n) + if obs.shape[0] == 1: + obs = obs.T # base_height has shape (1, data_length) + obs = obs[: TRAJ_LENGTH * batches, :] + obs = obs.reshape((TRAJ_LENGTH, batches, -1)) obs_all = torch.cat((obs_all, obs), dim=-1) return obs_all.float() -def get_rewards(data_struct, batch_size): +def get_rewards(data_struct, batches): + rewards_dict = {name: [] for name in REWARD_WEIGHTS.keys()} # for plotting rewards_all = torch.empty(0).to(DEVICE) - for i in range(batch_size): + for i in range(TRAJ_LENGTH * batches): mini_cheetah = MinimalistCheetah(device=DEVICE) mini_cheetah.set_states( - base_height=data_struct[1][:, i], # shape (1, batch_size) + base_height=data_struct[1][:, i], # shape (1, data_length) base_lin_vel=data_struct[2][i], base_ang_vel=data_struct[3][i], proj_gravity=data_struct[4][i], @@ -75,8 +84,14 @@ def get_rewards(data_struct, batch_size): total_rewards = 0 for name, weight in REWARD_WEIGHTS.items(): reward = eval(f"mini_cheetah._reward_{name}()") + 
rewards_dict[name].append(reward.item()) total_rewards += weight * reward rewards_all = torch.cat((rewards_all, total_rewards), dim=0) + rewards_all = rewards_all.reshape((TRAJ_LENGTH, batches)) + + if PLOT: + for name, rewards in rewards_dict.items(): + plt.plot(rewards, label=name) return rewards_all.float() @@ -85,15 +100,16 @@ def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER"): path = os.path.join(ROOT_DIR, LOAD_RUN, LOG_FILE) data = scipy.io.loadmat(path) data_struct = data[name][0][0] - batch_size = data_struct[1].shape[1] # base_height: shape (1, batch_size) + + data_length = data_struct[1].shape[1] # base_height: shape (1, data_length) + num_trajs = data_length // TRAJ_LENGTH data_dict = TensorDict( {}, device=DEVICE, - batch_size=(batch_size - 1, 1), # -1 for next_obs + batch_size=(TRAJ_LENGTH - 1, num_trajs), # -1 for next_obs ) - - actor_obs = get_obs(train_cfg["actor"]["obs"], data_struct, batch_size) - critic_obs = get_obs(train_cfg["critic"]["obs"], data_struct, batch_size) + actor_obs = get_obs(train_cfg["actor"]["obs"], data_struct, num_trajs) + critic_obs = get_obs(train_cfg["critic"]["obs"], data_struct, num_trajs) data_dict["actor_obs"] = actor_obs[:-1] data_dict["next_actor_obs"] = actor_obs[1:] @@ -102,17 +118,23 @@ def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER"): actions_idx = DATA_LIST.index("dof_pos_target") actions = torch.tensor(data_struct[actions_idx]).to(DEVICE) - actions = actions.reshape((batch_size, 1, -1)) # shape (batch_size, 1, n) + actions = actions[: TRAJ_LENGTH * num_trajs, :] + actions = actions.reshape((TRAJ_LENGTH, num_trajs, -1)) data_dict["actions"] = actions[:-1] - rewards = get_rewards(data_struct, batch_size) - rewards = rewards.reshape((batch_size, 1)) # shape (batch_size, 1) + rewards = get_rewards(data_struct, num_trajs) data_dict["rewards"] = rewards[:-1] # Assume no termination - data_dict["timed_out"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) - data_dict["terminal"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) - data_dict["dones"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) + data_dict["timed_out"] = torch.zeros( + TRAJ_LENGTH - 1, num_trajs, device=DEVICE, dtype=bool + ) + data_dict["terminal"] = torch.zeros( + TRAJ_LENGTH - 1, num_trajs, device=DEVICE, dtype=bool + ) + data_dict["dones"] = torch.zeros( + TRAJ_LENGTH - 1, num_trajs, device=DEVICE, dtype=bool + ) return data_dict @@ -124,7 +146,9 @@ def setup(): train_cfg = class_to_dict(train_cfg) data_dict = get_data_dict(train_cfg) - runner = FineTuneRunner(train_cfg, data_dict, device=DEVICE) + runner = FineTuneRunner( + train_cfg, data_dict, exploration_scale=EXPLORATION_SCALE, device=DEVICE + ) runner._set_up_alg() return runner @@ -145,13 +169,18 @@ def finetune(runner): runner.data_dict["actor_obs"] ) diff = actions_new - actions_old - print("Mean action diff per actuator: ", diff.mean(dim=0)) + print("Mean action diff per actuator: ", diff.mean(dim=(0, 1))) save_path = os.path.join(ROOT_DIR, LOAD_RUN, "model_" + str(ITERATION + 1) + ".pt") runner.save(save_path) - # export_path = os.path.join(ROOT_DIR, LOAD_RUN, "exported_finetuned") - # runner.export(export_path) + export_path = os.path.join(ROOT_DIR, LOAD_RUN, "exported_finetuned") + runner.export(export_path) + + if PLOT: + plt.title("Rewards") + plt.legend() + plt.show() if __name__ == "__main__": From f3a4cc426c7749a664e0da50c6e0b9ef9a38fc23 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Thu, 11 Jul 2024 14:56:04 -0400 Subject: [PATCH 27/42] added min base 
height reward --- gym/envs/mini_cheetah/minimalist_cheetah.py | 16 ++++++++++++++-- scripts/finetune.py | 4 +++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/gym/envs/mini_cheetah/minimalist_cheetah.py b/gym/envs/mini_cheetah/minimalist_cheetah.py index 658bb9a3..c87db2e5 100644 --- a/gym/envs/mini_cheetah/minimalist_cheetah.py +++ b/gym/envs/mini_cheetah/minimalist_cheetah.py @@ -31,12 +31,24 @@ def set_states( phase_obs = torch.tensor(phase_obs, device=self.device).unsqueeze(0) self.phase_sin = phase_obs[:, 0].unsqueeze(0) - def _switch(self): + def _sqrdexp(self, x, scale=1.0): + """shorthand helper for squared exponential""" + return torch.exp(-torch.square(x / scale) / self.tracking_sigma) + + def _switch(self, scale=0.5): + # TODO: Check scale, RS commands are scaled differently than QGym c_vel = torch.linalg.norm(self.commands, dim=1) return torch.exp( - -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) / 0.1 + -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - scale)) / scale ) + def _reward_min_base_height(self, target_height=0.3, scale=0.3): + """Squared exponential saturating at base_height target""" + # TODO: Check scale + error = (self.base_height - target_height) / scale + error = torch.clamp(error, max=0, min=None).flatten() + return self._sqrdexp(error) + def _reward_tracking_lin_vel(self): """Tracking of linear velocity commands (xy axes)""" # just use lin_vel? diff --git a/scripts/finetune.py b/scripts/finetune.py index 7fc3f270..ebeaa6dc 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -43,6 +43,7 @@ # Finetune for these rewards instead of the ones in the cfg REWARD_WEIGHTS = { + "min_base_height": 1.5, # "tracking_lin_vel": 4.0, # "tracking_ang_vel": 2.0, # "orientation": 1.0, @@ -91,7 +92,8 @@ def get_rewards(data_struct, batches): if PLOT: for name, rewards in rewards_dict.items(): - plt.plot(rewards, label=name) + n_steps = 200 + plt.plot(rewards[:n_steps], label=name) return rewards_all.float() From fdc028fbcc0d05a00f144873c68fd40a7d9a9ffb Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Sat, 13 Jul 2024 19:20:45 -0400 Subject: [PATCH 28/42] reset 10% standstill/pure lin/pure ang vel; set rewards: grf=3, standstill=2 --- gym/envs/mini_cheetah/mini_cheetah_ref.py | 20 ++++++++++++---- .../mini_cheetah/mini_cheetah_ref_config.py | 14 +++++------ learning/runners/on_policy_runner.py | 4 +--- scripts/finetune.py | 23 ++++++++++++------- scripts/generate_commands.py | 21 +++++++++++++++++ 5 files changed, 59 insertions(+), 23 deletions(-) create mode 100644 scripts/generate_commands.py diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref.py b/gym/envs/mini_cheetah/mini_cheetah_ref.py index 57e1f0e7..e620a3c2 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref.py @@ -44,11 +44,21 @@ def _post_decimation_step(self): def _resample_commands(self, env_ids): super()._resample_commands(env_ids) - # * with 10% chance, reset to 0 commands - rand_ids = torch_rand_float( - 0, 1, (len(env_ids), 1), device=self.device - ).squeeze(1) - self.commands[env_ids, :3] *= (rand_ids < 0.9).unsqueeze(1) + # * with 10% chance, reset to 0 commands except for forward + self.commands[env_ids, 1:] *= ( + torch_rand_float(0, 1, (len(env_ids), 1), device=self.device).squeeze(1) + < 0.9 + ).unsqueeze(1) + # * with 10% chance, reset to 0 commands except for rotation + self.commands[env_ids, :2] *= ( + torch_rand_float(0, 1, (len(env_ids), 1), device=self.device).squeeze(1) + < 0.9 + ).unsqueeze(1) + # * 
with 10% chance, reset to 0 + self.commands[env_ids, :] *= ( + torch_rand_float(0, 1, (len(env_ids), 1), device=self.device).squeeze(1) + < 0.9 + ).unsqueeze(1) def _switch(self): c_vel = torch.linalg.norm(self.commands, dim=1) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 09970e66..153e54ca 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -129,13 +129,13 @@ class weights: collision = 0.0 action_rate = 0.01 action_rate2 = 0.001 - stand_still = 0.0 + stand_still = 2.0 dof_pos_limits = 0.0 feet_contact_forces = 0.0 dof_near_home = 0.0 - reference_traj = 1.5 - swing_grf = 1.5 - stance_grf = 1.5 + reference_traj = 0.0 + swing_grf = 3.0 + stance_grf = 3.0 class termination_weight: termination = 0.15 @@ -151,9 +151,9 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): storage_size = 4 * 32 * 4096 # num_policies*num_stpes*num_envs # Finetuning - learning_rate = 3e-4 - max_gradient_steps = 10 - batch_size = 4096 + # learning_rate = 3e-4 + # max_gradient_steps = 10 + # batch_size = 4096 class runner(MiniCheetahRunnerCfg.runner): run_name = "" diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index c98d088c..c3465433 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -181,9 +181,7 @@ def set_up_logger(self): self.alg, ["mean_value_loss", "mean_surrogate_loss", "learning_rate"], ) - logger.register_category( - "actor", self.alg.actor, ["action_std", "entropy", "get_std"] - ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) diff --git a/scripts/finetune.py b/scripts/finetune.py index ebeaa6dc..41c24209 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -11,17 +11,20 @@ import os import torch import scipy.io +import numpy as np from tensordict import TensorDict import matplotlib.pyplot as plt ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" -LOAD_RUN = "Jul08_13-02-15_PPO32_S16" -LOG_FILE = "PPO32_S16_Jul08_expl08.mat" +LOAD_RUN = "Jul11_17-55-01_PPO32_S16_grf3" +LOG_FILE = "PPO32_S16_grf3_expl1.mat" TRAJ_LENGTH = 100 # split data into chunks of this length -EXPLORATION_SCALE = 0.8 # scale std in SmoothActor -ITERATION = 500 +EXPLORATION_SCALE = 1 # scale std in SmoothActor +ITERATION = 700 # load this iteration + PLOT = True +PLOT_N = 300 # number of steps to plot # Data struct fields from Robot-Software logs DATA_LIST = [ @@ -90,10 +93,14 @@ def get_rewards(data_struct, batches): rewards_all = torch.cat((rewards_all, total_rewards), dim=0) rewards_all = rewards_all.reshape((TRAJ_LENGTH, batches)) - if PLOT: - for name, rewards in rewards_dict.items(): - n_steps = 200 - plt.plot(rewards[:n_steps], label=name) + rewards_stats = {} + for name, rewards in rewards_dict.items(): + rewards = np.array(rewards) + rewards_stats[name] = [rewards.mean(), rewards.std()] + if PLOT: + plt.plot(rewards[:PLOT_N], label=name) + + print(rewards_stats) return rewards_all.float() diff --git a/scripts/generate_commands.py b/scripts/generate_commands.py new file mode 100644 index 00000000..c1de1425 --- /dev/null +++ b/scripts/generate_commands.py @@ -0,0 +1,21 @@ +import numpy as np +import random + +# Command ranges +x_range = [-0.5, 1] # [m/s] +y_range = [-0.25, 0.25] # [m/s] +yaw_range = [-1, 1] # [rad/s] + +# Generate random command sequence +N = 10 +steps = 300 +commands = 
np.zeros((N * steps, 3)) + +for i in range(1, N): # first iteration 0 commands + x = random.uniform(x_range[0], x_range[1]) + y = random.uniform(y_range[0], y_range[1]) + yaw = random.uniform(yaw_range[0], yaw_range[1]) + commands[i * steps : (i + 1) * steps] = np.array([x, y, yaw]) + +# Export to txt +np.savetxt("commands.txt", commands, fmt="%.3f") From 3006c4c8ef54e32cbe83d1334905f8b9b5f853d4 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Mon, 15 Jul 2024 13:47:57 -0400 Subject: [PATCH 29/42] evaluate rewards and store in dataframe, added stand still/action rate --- gym/envs/mini_cheetah/minimalist_cheetah.py | 69 ++++++++++- scripts/eval_rewards.py | 126 ++++++++++++++++++++ scripts/finetune.py | 100 +++++++++------- 3 files changed, 249 insertions(+), 46 deletions(-) create mode 100644 scripts/eval_rewards.py diff --git a/gym/envs/mini_cheetah/minimalist_cheetah.py b/gym/envs/mini_cheetah/minimalist_cheetah.py index c87db2e5..bb67fb69 100644 --- a/gym/envs/mini_cheetah/minimalist_cheetah.py +++ b/gym/envs/mini_cheetah/minimalist_cheetah.py @@ -6,10 +6,20 @@ class MinimalistCheetah: Helper class for computing mini cheetah rewards """ - def __init__(self, device="cpu", tracking_sigma=0.25): + def __init__(self, device="cpu", dt=0.01, tracking_sigma=0.25): self.device = device + self.dt = dt self.tracking_sigma = tracking_sigma + # Default joint angles from mini_cheetah_config.py + self.default_dof_pos = torch.tensor( + [0.0, -0.785398, 1.596976], device=self.device + ).repeat(4) + + # Previous 2 dof pos targets + self.dof_target_prev = None + self.dof_target_prev2 = None + def set_states( self, base_height, @@ -17,8 +27,11 @@ def set_states( base_ang_vel, proj_gravity, commands, + dof_pos_obs, + dof_vel, phase_obs, grf, + dof_pos_target, ): # Unsqueeze so first dim is batch_size self.base_height = torch.tensor(base_height, device=self.device).unsqueeze(0) @@ -26,20 +39,36 @@ def set_states( self.base_ang_vel = torch.tensor(base_ang_vel, device=self.device).unsqueeze(0) self.proj_gravity = torch.tensor(proj_gravity, device=self.device).unsqueeze(0) self.commands = torch.tensor(commands, device=self.device).unsqueeze(0) + self.dof_pos_obs = torch.tensor(dof_pos_obs, device=self.device).unsqueeze(0) + self.dof_vel = torch.tensor(dof_vel, device=self.device).unsqueeze(0) self.grf = torch.tensor(grf, device=self.device).unsqueeze(0) + # Compute phase sin phase_obs = torch.tensor(phase_obs, device=self.device).unsqueeze(0) self.phase_sin = phase_obs[:, 0].unsqueeze(0) + # Set targets + self.dof_pos_target = torch.tensor( + dof_pos_target, device=self.device + ).unsqueeze(0) + if self.dof_target_prev is None: + self.dof_target_prev = self.dof_pos_target + if self.dof_target_prev2 is None: + self.dof_target_prev2 = self.dof_target_prev + + def post_process(self): + self.dof_target_prev2 = self.dof_target_prev + self.dof_target_prev = self.dof_pos_target + def _sqrdexp(self, x, scale=1.0): """shorthand helper for squared exponential""" return torch.exp(-torch.square(x / scale) / self.tracking_sigma) - def _switch(self, scale=0.5): + def _switch(self, scale=0.1): # TODO: Check scale, RS commands are scaled differently than QGym c_vel = torch.linalg.norm(self.commands, dim=1) return torch.exp( - -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - scale)) / scale + -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) / scale ) def _reward_min_base_height(self, target_height=0.3, scale=0.3): @@ -73,11 +102,41 @@ def _reward_swing_grf(self, contact_thresh=0.5): in_contact = 
torch.gt(self.grf, contact_thresh) ph_off = torch.gt(self.phase_sin, 0) # phase <= pi rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1) - return -torch.sum(rew.float(), dim=1) # * (1 - self._switch()) + return -torch.sum(rew.float(), dim=1) * (1 - self._switch()) def _reward_stance_grf(self, contact_thresh=0.5): """Reward non-zero grf during stance (pi to 2pi)""" in_contact = torch.gt(self.grf, contact_thresh) ph_off = torch.lt(self.phase_sin, 0) # phase >= pi rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1) - return torch.sum(rew.float(), dim=1) # * (1 - self._switch()) + return torch.sum(rew.float(), dim=1) * (1 - self._switch()) + + def _reward_stand_still(self): + """Penalize motion at zero commands""" + # * normalize angles so we care about being within 5 deg + rew_pos = torch.mean( + self._sqrdexp((self.dof_pos_obs) / torch.pi * 36), + dim=1, + ) + rew_vel = torch.mean(self._sqrdexp(self.dof_vel), dim=1) + rew_base_vel = 0 # TODO: This seems too noisy + # rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1) + # rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1) + return (rew_vel + rew_pos - rew_base_vel) * self._switch() + + def _reward_action_rate(self): + """Penalize changes in actions""" + # TODO: check this + error = torch.square(self.dof_pos_target - self.dof_target_prev) / self.dt**2 + return -torch.sum(error, dim=1) + + def _reward_action_rate2(self): + """Penalize changes in actions""" + # TODO: check this + error = ( + torch.square( + self.dof_pos_target - 2 * self.dof_target_prev + self.dof_target_prev2 + ) + / self.dt + ) + return -torch.sum(error, dim=1) diff --git a/scripts/eval_rewards.py b/scripts/eval_rewards.py new file mode 100644 index 00000000..15b287f3 --- /dev/null +++ b/scripts/eval_rewards.py @@ -0,0 +1,126 @@ +from gym.envs.mini_cheetah.minimalist_cheetah import MinimalistCheetah + +from gym import LEGGED_GYM_ROOT_DIR + +import os +import torch +import scipy.io +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" +LOAD_RUN = "Jul13_01-49-59_PPO32_S16" +LOG_FILE = "PPO_701_a001.mat" +REWARDS_FILE = "rewards_a001.csv" + +ITERATION = 701 # load this iteration + +PLOT = True +PLOT_N = 1000 # number of steps to plot + +# Data struct fields from Robot-Software logs +DATA_LIST = [ + "header", + "base_height", # 1 shape: (1, data_length) + # the following are all shape: (data_length, n) + "base_lin_vel", # 2 + "base_ang_vel", # 3 + "projected_gravity", # 4 + "commands", # 5 + "dof_pos_obs", # 6 + "dof_vel", # 7 + "phase_obs", # 8 + "grf", # 9 + "dof_pos_target", # 10 + "exploration_noise", # 11 + "footer", +] + +# Finetune for these rewards instead of the ones in the cfg +REWARD_WEIGHTS = { + "min_base_height": 1.5, + # "tracking_lin_vel": 4.0, + # "tracking_ang_vel": 2.0, + "orientation": 1.0, + "swing_grf": 1.0, + "stance_grf": 1.0, + "stand_still": 1.0, + "action_rate": 0.001, + "action_rate2": 0.001, +} + +DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def get_rewards(data_struct): + data_length = data_struct[1].shape[1] # base_height: shape (1, data_length) + + mini_cheetah = MinimalistCheetah(device=DEVICE) + rewards_dict = {name: [] for name in REWARD_WEIGHTS.keys()} # for plotting + rewards_all = torch.empty(0).to(DEVICE) + + for i in range(data_length): + mini_cheetah.set_states( + base_height=data_struct[1][:, i], # shape (1, data_length) + base_lin_vel=data_struct[2][i], + 
base_ang_vel=data_struct[3][i], + proj_gravity=data_struct[4][i], + commands=data_struct[5][i], + dof_pos_obs=data_struct[6][i], + dof_vel=data_struct[7][i], + phase_obs=data_struct[8][i], + grf=data_struct[9][i], + dof_pos_target=data_struct[10][i], + ) + total_rewards = 0 + for name, weight in REWARD_WEIGHTS.items(): + reward = weight * eval(f"mini_cheetah._reward_{name}()") + rewards_dict[name].append(reward.item()) + total_rewards += reward + rewards_all = torch.cat((rewards_all, total_rewards), dim=0) + # Post process mini cheetah + mini_cheetah.post_process() + + rewards_dict["total"] = rewards_all.tolist() + + # Save rewards in dataframe + rewards_path = os.path.join(ROOT_DIR, LOAD_RUN, REWARDS_FILE) + if not os.path.exists(rewards_path): + rewards_df = pd.DataFrame(columns=["iteration", "type", "mean", "std"]) + else: + rewards_df = pd.read_csv(rewards_path) + for name, rewards in rewards_dict.items(): + rewards = np.array(rewards) + mean = rewards.mean() + std = rewards.std() + rewards_df = rewards_df._append( + { + "iteration": ITERATION, + "type": name, + "mean": mean, + "std": std, + }, + ignore_index=True, + ) + if PLOT: + plt.plot(rewards[:PLOT_N], label=name) + rewards_df.to_csv(rewards_path, index=False) + print(rewards_df) + + +def setup(name="SMOOTH_RL_CONTROLLER"): + path = os.path.join(ROOT_DIR, LOAD_RUN, LOG_FILE) + data = scipy.io.loadmat(path) + data_struct = data[name][0][0] + return data_struct + + +if __name__ == "__main__": + data_struct = setup() + get_rewards(data_struct) + + if PLOT: + plt.title("Rewards") + plt.legend() + plt.show() diff --git a/scripts/finetune.py b/scripts/finetune.py index 41c24209..f4cb44c7 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -12,35 +12,33 @@ import torch import scipy.io import numpy as np +import pandas as pd from tensordict import TensorDict -import matplotlib.pyplot as plt ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" -LOAD_RUN = "Jul11_17-55-01_PPO32_S16_grf3" -LOG_FILE = "PPO32_S16_grf3_expl1.mat" +LOAD_RUN = "Jul13_01-49-59_PPO32_S16" +LOG_FILE = "PPO_701_a001_expl08.mat" +REWARDS_FILE = "rewards_test.csv" TRAJ_LENGTH = 100 # split data into chunks of this length -EXPLORATION_SCALE = 1 # scale std in SmoothActor -ITERATION = 700 # load this iteration - -PLOT = True -PLOT_N = 300 # number of steps to plot +EXPLORATION_SCALE = 0.8 # scale std in SmoothActor +ITERATION = 701 # load this iteration # Data struct fields from Robot-Software logs DATA_LIST = [ "header", - "base_height", # shape (1, data_length) - # the following are all shape (data_length, n) - "base_lin_vel", - "base_ang_vel", - "projected_gravity", - "commands", - "dof_pos_obs", - "dof_vel", - "phase_obs", - "grf", - "dof_pos_target", - "exploration_noise", + "base_height", # 1 shape: (1, data_length) + # the following are all shape: (data_length, n) + "base_lin_vel", # 2 + "base_ang_vel", # 3 + "projected_gravity", # 4 + "commands", # 5 + "dof_pos_obs", # 6 + "dof_vel", # 7 + "phase_obs", # 8 + "grf", # 9 + "dof_pos_target", # 10 + "exploration_noise", # 11 "footer", ] @@ -49,9 +47,12 @@ "min_base_height": 1.5, # "tracking_lin_vel": 4.0, # "tracking_ang_vel": 2.0, - # "orientation": 1.0, - "swing_grf": 1.5, - "stance_grf": 1.5, + "orientation": 1.0, + "swing_grf": 1.0, + "stance_grf": 1.0, + "stand_still": 1.0, + "action_rate": 0.001, + "action_rate2": 0.001, } DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -71,37 +72,59 @@ def get_obs(obs_list, data_struct, batches): return obs_all.float() -def 
get_rewards(data_struct, batches): +def get_rewards(data_struct, num_trajs): + mini_cheetah = MinimalistCheetah(device=DEVICE) rewards_dict = {name: [] for name in REWARD_WEIGHTS.keys()} # for plotting rewards_all = torch.empty(0).to(DEVICE) - for i in range(TRAJ_LENGTH * batches): - mini_cheetah = MinimalistCheetah(device=DEVICE) + + for i in range(TRAJ_LENGTH * num_trajs): mini_cheetah.set_states( base_height=data_struct[1][:, i], # shape (1, data_length) base_lin_vel=data_struct[2][i], base_ang_vel=data_struct[3][i], proj_gravity=data_struct[4][i], commands=data_struct[5][i], + dof_pos_obs=data_struct[6][i], + dof_vel=data_struct[7][i], phase_obs=data_struct[8][i], grf=data_struct[9][i], + dof_pos_target=data_struct[10][i], ) total_rewards = 0 for name, weight in REWARD_WEIGHTS.items(): - reward = eval(f"mini_cheetah._reward_{name}()") + reward = weight * eval(f"mini_cheetah._reward_{name}()") rewards_dict[name].append(reward.item()) - total_rewards += weight * reward + total_rewards += reward rewards_all = torch.cat((rewards_all, total_rewards), dim=0) - rewards_all = rewards_all.reshape((TRAJ_LENGTH, batches)) + # Post process mini cheetah + mini_cheetah.post_process() + + rewards_dict["total"] = rewards_all.tolist() - rewards_stats = {} + # Save rewards in dataframe + rewards_path = os.path.join(ROOT_DIR, LOAD_RUN, REWARDS_FILE) + if not os.path.exists(rewards_path): + rewards_df = pd.DataFrame(columns=["iteration", "type", "mean", "std"]) + else: + rewards_df = pd.read_csv(rewards_path) for name, rewards in rewards_dict.items(): rewards = np.array(rewards) - rewards_stats[name] = [rewards.mean(), rewards.std()] - if PLOT: - plt.plot(rewards[:PLOT_N], label=name) - - print(rewards_stats) + mean = rewards.mean() + std = rewards.std() + rewards_df = rewards_df._append( + { + "iteration": str(ITERATION) + "_expl", + "type": name, + "mean": mean, + "std": std, + }, + ignore_index=True, + ) + rewards_df.to_csv(rewards_path, index=False) + print(rewards_df) + # Return total rewards + rewards_all = rewards_all.reshape((TRAJ_LENGTH, num_trajs)) return rewards_all.float() @@ -183,14 +206,9 @@ def finetune(runner): save_path = os.path.join(ROOT_DIR, LOAD_RUN, "model_" + str(ITERATION + 1) + ".pt") runner.save(save_path) - export_path = os.path.join(ROOT_DIR, LOAD_RUN, "exported_finetuned") + export_path = os.path.join(ROOT_DIR, LOAD_RUN, "exported_" + str(ITERATION + 1)) runner.export(export_path) - if PLOT: - plt.title("Rewards") - plt.legend() - plt.show() - if __name__ == "__main__": runner = setup() From 27e756849e11fac77c9e9eb7ba8ff94ffe3f043e Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Mon, 15 Jul 2024 16:47:32 -0400 Subject: [PATCH 30/42] handle IPG in FintuneRunner, and revert storage to single traj --- learning/runners/finetune_runner.py | 67 +++++++++++++++--- scripts/finetune.py | 104 ++++++++++++++-------------- 2 files changed, 108 insertions(+), 63 deletions(-) diff --git a/learning/runners/finetune_runner.py b/learning/runners/finetune_runner.py index 4cfdf895..fdff23b4 100644 --- a/learning/runners/finetune_runner.py +++ b/learning/runners/finetune_runner.py @@ -24,9 +24,25 @@ def _set_up_alg(self): ) else: actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) - critic = Critic(num_critic_obs, **self.critic_cfg) - alg_class = eval(self.cfg["algorithm_class_name"]) - self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) + + alg_class_name = self.cfg["algorithm_class_name"] + alg_class = eval(alg_class_name) + + if alg_class_name == "PPO_IPG": + 
critic_v = Critic(num_critic_obs, **self.critic_cfg) + critic_q = Critic(num_critic_obs + num_actions, **self.critic_cfg) + target_critic_q = Critic(num_critic_obs + num_actions, **self.critic_cfg) + self.alg = alg_class( + actor, + critic_v, + critic_q, + target_critic_q, + device=self.device, + **self.alg_cfg, + ) + else: + critic = Critic(num_critic_obs, **self.critic_cfg) + self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) def parse_train_cfg(self, train_cfg): self.cfg = train_cfg["runner"] @@ -37,9 +53,27 @@ def parse_train_cfg(self, train_cfg): def learn(self): # Single update on data dict - self.alg.update(self.data_dict) + if self.cfg["algorithm_class_name"] == "PPO_IPG": + # TODO: How to handle off-policy data + self.alg.update(self.data_dict, self.data_dict) + else: + self.alg.update(self.data_dict) def save(self, path): + if self.cfg["algorithm_class_name"] == "PPO_IPG": + torch.save( + { + "actor_state_dict": self.alg.actor.state_dict(), + "critic_v_state_dict": self.alg.critic_v.state_dict(), + "critic_q_state_dict": self.alg.critic_q.state_dict(), + "optimizer_state_dict": self.alg.optimizer.state_dict(), + "critic_v_opt_state_dict": self.alg.critic_v_optimizer.state_dict(), + "critic_q_opt_state_dict": self.alg.critic_q_optimizer.state_dict(), + "iter": 1, # only one iteration + }, + path, + ) + return torch.save( { "actor_state_dict": self.alg.actor.state_dict(), @@ -54,12 +88,25 @@ def save(self, path): def load(self, path, load_optimizer=True): loaded_dict = torch.load(path) self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) - self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) - if load_optimizer: - self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) - self.alg.critic_optimizer.load_state_dict( - loaded_dict["critic_optimizer_state_dict"] - ) + + if self.cfg["algorithm_class_name"] == "PPO_IPG": + self.alg.critic_v.load_state_dict(loaded_dict["critic_v_state_dict"]) + self.alg.critic_q.load_state_dict(loaded_dict["critic_q_state_dict"]) + if load_optimizer: + self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) + self.alg.critic_v_optimizer.load_state_dict( + loaded_dict["critic_v_opt_state_dict"] + ) + self.alg.critic_q_optimizer.load_state_dict( + loaded_dict["critic_q_opt_state_dict"] + ) + else: + self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) + if load_optimizer: + self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) + self.alg.critic_optimizer.load_state_dict( + loaded_dict["critic_optimizer_state_dict"] + ) def export(self, path): # Need to make a copy of actor diff --git a/scripts/finetune.py b/scripts/finetune.py index f4cb44c7..ab97b5bd 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -17,12 +17,20 @@ ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" LOAD_RUN = "Jul13_01-49-59_PPO32_S16" -LOG_FILE = "PPO_701_a001_expl08.mat" +# LOAD_RUN = "Jul12_15-53-57_IPG32_S16" +LOG_FILE = "PPO_700_expl08.mat" +ITERATION = 700 # load this iteration + +# IPG_BUFFER = [ +# "IPG_700_expl08.mat", +# "IPG_701_expl08.mat", +# ] + +LOG_REWARDS = False # whether to log to dataframe REWARDS_FILE = "rewards_test.csv" TRAJ_LENGTH = 100 # split data into chunks of this length EXPLORATION_SCALE = 0.8 # scale std in SmoothActor -ITERATION = 701 # load this iteration # Data struct fields from Robot-Software logs DATA_LIST = [ @@ -58,28 +66,25 @@ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") -def get_obs(obs_list, 
data_struct, batches): +def get_obs(obs_list, data_struct, batch_size): obs_all = torch.empty(0).to(DEVICE) for obs_name in obs_list: data_idx = DATA_LIST.index(obs_name) obs = torch.tensor(data_struct[data_idx]).to(DEVICE) - if obs.shape[0] == 1: - obs = obs.T # base_height has shape (1, data_length) - obs = obs[: TRAJ_LENGTH * batches, :] - obs = obs.reshape((TRAJ_LENGTH, batches, -1)) + obs = obs.reshape((batch_size, 1, -1)) # shape (batch_size, 1, n) obs_all = torch.cat((obs_all, obs), dim=-1) return obs_all.float() -def get_rewards(data_struct, num_trajs): +def get_rewards(data_struct, batch_size): mini_cheetah = MinimalistCheetah(device=DEVICE) rewards_dict = {name: [] for name in REWARD_WEIGHTS.keys()} # for plotting rewards_all = torch.empty(0).to(DEVICE) - for i in range(TRAJ_LENGTH * num_trajs): + for i in range(batch_size): mini_cheetah.set_states( - base_height=data_struct[1][:, i], # shape (1, data_length) + base_height=data_struct[1][:, i], # shape (1, batch_size) base_lin_vel=data_struct[2][i], base_ang_vel=data_struct[3][i], proj_gravity=data_struct[4][i], @@ -101,30 +106,28 @@ def get_rewards(data_struct, num_trajs): rewards_dict["total"] = rewards_all.tolist() - # Save rewards in dataframe - rewards_path = os.path.join(ROOT_DIR, LOAD_RUN, REWARDS_FILE) - if not os.path.exists(rewards_path): - rewards_df = pd.DataFrame(columns=["iteration", "type", "mean", "std"]) - else: - rewards_df = pd.read_csv(rewards_path) - for name, rewards in rewards_dict.items(): - rewards = np.array(rewards) - mean = rewards.mean() - std = rewards.std() - rewards_df = rewards_df._append( - { - "iteration": str(ITERATION) + "_expl", - "type": name, - "mean": mean, - "std": std, - }, - ignore_index=True, - ) - rewards_df.to_csv(rewards_path, index=False) - print(rewards_df) + if LOG_REWARDS: + rewards_path = os.path.join(ROOT_DIR, LOAD_RUN, REWARDS_FILE) + if not os.path.exists(rewards_path): + rewards_df = pd.DataFrame(columns=["iteration", "type", "mean", "std"]) + else: + rewards_df = pd.read_csv(rewards_path) + for name, rewards in rewards_dict.items(): + rewards = np.array(rewards) + mean = rewards.mean() + std = rewards.std() + rewards_df = rewards_df._append( + { + "iteration": str(ITERATION) + "_expl", + "type": name, + "mean": mean, + "std": std, + }, + ignore_index=True, + ) + rewards_df.to_csv(rewards_path, index=False) + print(rewards_df) - # Return total rewards - rewards_all = rewards_all.reshape((TRAJ_LENGTH, num_trajs)) return rewards_all.float() @@ -132,16 +135,14 @@ def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER"): path = os.path.join(ROOT_DIR, LOAD_RUN, LOG_FILE) data = scipy.io.loadmat(path) data_struct = data[name][0][0] - - data_length = data_struct[1].shape[1] # base_height: shape (1, data_length) - num_trajs = data_length // TRAJ_LENGTH + batch_size = data_struct[1].shape[1] # base_height: shape (1, batch_size) data_dict = TensorDict( {}, device=DEVICE, - batch_size=(TRAJ_LENGTH - 1, num_trajs), # -1 for next_obs + batch_size=(batch_size - 1, 1), # -1 for next_obs ) - actor_obs = get_obs(train_cfg["actor"]["obs"], data_struct, num_trajs) - critic_obs = get_obs(train_cfg["critic"]["obs"], data_struct, num_trajs) + actor_obs = get_obs(train_cfg["actor"]["obs"], data_struct, batch_size) + critic_obs = get_obs(train_cfg["critic"]["obs"], data_struct, batch_size) data_dict["actor_obs"] = actor_obs[:-1] data_dict["next_actor_obs"] = actor_obs[1:] @@ -149,24 +150,18 @@ def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER"): data_dict["next_critic_obs"] = critic_obs[1:] 
actions_idx = DATA_LIST.index("dof_pos_target") - actions = torch.tensor(data_struct[actions_idx]).to(DEVICE) - actions = actions[: TRAJ_LENGTH * num_trajs, :] - actions = actions.reshape((TRAJ_LENGTH, num_trajs, -1)) + actions = torch.tensor(data_struct[actions_idx]).to(DEVICE).float() + actions = actions.reshape((batch_size, 1, -1)) # shape (batch_size, 1, n) data_dict["actions"] = actions[:-1] - rewards = get_rewards(data_struct, num_trajs) + rewards = get_rewards(data_struct, batch_size) + rewards = rewards.reshape((batch_size, 1)) # shape (batch_size, 1) data_dict["rewards"] = rewards[:-1] # Assume no termination - data_dict["timed_out"] = torch.zeros( - TRAJ_LENGTH - 1, num_trajs, device=DEVICE, dtype=bool - ) - data_dict["terminal"] = torch.zeros( - TRAJ_LENGTH - 1, num_trajs, device=DEVICE, dtype=bool - ) - data_dict["dones"] = torch.zeros( - TRAJ_LENGTH - 1, num_trajs, device=DEVICE, dtype=bool - ) + data_dict["timed_out"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) + data_dict["terminal"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) + data_dict["dones"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) return data_dict @@ -179,7 +174,10 @@ def setup(): data_dict = get_data_dict(train_cfg) runner = FineTuneRunner( - train_cfg, data_dict, exploration_scale=EXPLORATION_SCALE, device=DEVICE + train_cfg, + data_dict, + exploration_scale=EXPLORATION_SCALE, + device=DEVICE, ) runner._set_up_alg() From 0a40204aa84698f43ab4076f3965afcfab86ad2c Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Tue, 16 Jul 2024 18:41:39 -0400 Subject: [PATCH 31/42] collect sim data for finetuning, added finetune config --- gym/envs/__init__.py | 7 + .../mini_cheetah_finetune_config.py | 149 ++++++++++++++++++ learning/runners/finetune_runner.py | 127 ++++++++++++++- scripts/finetune.py | 129 +++++++-------- 4 files changed, 348 insertions(+), 64 deletions(-) create mode 100644 gym/envs/mini_cheetah/mini_cheetah_finetune_config.py diff --git a/gym/envs/__init__.py b/gym/envs/__init__.py index 57cbd8c0..0736c973 100644 --- a/gym/envs/__init__.py +++ b/gym/envs/__init__.py @@ -27,6 +27,7 @@ "MiniCheetahCfg": ".mini_cheetah.mini_cheetah_config", "MiniCheetahRefCfg": ".mini_cheetah.mini_cheetah_ref_config", "MiniCheetahOscCfg": ".mini_cheetah.mini_cheetah_osc_config", + "MiniCheetahFineTuneCfg": ".mini_cheetah.mini_cheetah_finetune_config", "MITHumanoidCfg": ".mit_humanoid.mit_humanoid_config", "A1Cfg": ".a1.a1_config", "AnymalCFlatCfg": ".anymal_c.flat.anymal_c_flat_config", @@ -39,6 +40,7 @@ "MiniCheetahRunnerCfg": ".mini_cheetah.mini_cheetah_config", "MiniCheetahRefRunnerCfg": ".mini_cheetah.mini_cheetah_ref_config", "MiniCheetahOscRunnerCfg": ".mini_cheetah.mini_cheetah_osc_config", + "MiniCheetahFineTuneRunnerCfg": ".mini_cheetah.mini_cheetah_finetune_config", "MITHumanoidRunnerCfg": ".mit_humanoid.mit_humanoid_config", "A1RunnerCfg": ".a1.a1_config", "AnymalCFlatRunnerCfg": ".anymal_c.flat.anymal_c_flat_config", @@ -59,6 +61,11 @@ "MiniCheetahOscCfg", "MiniCheetahOscRunnerCfg", ], + "mini_cheetah_finetune": [ + "MiniCheetahRef", + "MiniCheetahFineTuneCfg", + "MiniCheetahFineTuneRunnerCfg", + ], "humanoid": ["MIT_Humanoid", "MITHumanoidCfg", "MITHumanoidRunnerCfg"], "humanoid_running": [ "HumanoidRunning", diff --git a/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py b/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py new file mode 100644 index 00000000..e07ee45a --- /dev/null +++ b/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py @@ -0,0 +1,149 @@ 
+from gym.envs.mini_cheetah.mini_cheetah_ref_config import ( + MiniCheetahRefCfg, + MiniCheetahRefRunnerCfg, +) + +BASE_HEIGHT_REF = 0.33 + + +class MiniCheetahFineTuneCfg(MiniCheetahRefCfg): + class env(MiniCheetahRefCfg.env): + num_envs = 1 + num_actuators = 12 + episode_length_s = 30.0 + + class terrain(MiniCheetahRefCfg.terrain): + pass + + class init_state(MiniCheetahRefCfg.init_state): + pass + + class control(MiniCheetahRefCfg.control): + # * PD Drive parameters: + stiffness = {"haa": 20.0, "hfe": 20.0, "kfe": 20.0} + damping = {"haa": 0.5, "hfe": 0.5, "kfe": 0.5} + gait_freq = 3.0 + ctrl_frequency = 50 + desired_sim_frequency = 500 + + class commands(MiniCheetahRefCfg.commands): + # * time before command are changed[s] + resampling_time = 3.0 + + # * smaller ranges for finetuning + class ranges: + lin_vel_x = [-1.0, 1.5] # min max [m/s] + lin_vel_y = 0.5 # max [m/s] + yaw_vel = 1.5 # max [rad/s] + + class push_robots(MiniCheetahRefCfg.push_robots): + pass + + class domain_rand(MiniCheetahRefCfg.domain_rand): + randomize_friction = False + friction_range = [0.5, 1.0] + randomize_base_mass = True + added_mass_range = [5.0, 5.0] + + class asset(MiniCheetahRefCfg.asset): + pass + + class reward_settings(MiniCheetahRefCfg.reward_settings): + pass + + class scaling(MiniCheetahRefCfg.scaling): + pass + + +class MiniCheetahFineTuneRunnerCfg(MiniCheetahRefRunnerCfg): + seed = -1 + runner_class_name = "OnPolicyRunner" + + class actor: + hidden_dims = [256, 256, 128] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + smooth_exploration = True + exploration_sample_freq = 16 + + normalize_obs = True + obs = [ + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "phase_obs", + ] + + actions = ["dof_pos_target"] + disable_actions = False + + class noise: + scale = 1.0 + dof_pos_obs = 0.01 + base_ang_vel = 0.01 + dof_pos = 0.005 + dof_vel = 0.005 + lin_vel = 0.05 + ang_vel = [0.3, 0.15, 0.4] + gravity_vec = 0.1 + + class critic: + hidden_dims = [256, 256, 128] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + normalize_obs = True + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "phase_obs", + "dof_pos_target", + ] + + class reward: + class weights: + tracking_lin_vel = 2.0 + tracking_ang_vel = 1.0 + orientation = 1.0 + min_base_height = 1.5 + action_rate = 0.001 + action_rate2 = 0.001 + stand_still = 1.0 + swing_grf = 1.0 + stance_grf = 1.0 + + class termination_weight: + termination = 0.15 + + class algorithm(MiniCheetahRefRunnerCfg.algorithm): + desired_kl = 0.02 # 0.02 for smooth-exploration, else 0.01 + + # IPG + polyak = 0.995 + use_cv = False + inter_nu = 0.5 + beta = "off_policy" + storage_size = 4 * 32 * 4096 # num_policies*num_stpes*num_envs + + # Finetuning + learning_rate = 3e-4 + max_gradient_steps = 5 + + class runner(MiniCheetahRefRunnerCfg.runner): + run_name = "" + experiment_name = "mini_cheetah_ref" + max_iterations = 20 # number of policy updates + algorithm_class_name = "PPO2" + num_steps_per_env = 3000 + + # Finetuning + resume = True + load_run = "Jul13_01-49-59_PPO32_S16" + checkpoint = 700 + save_interval = 5 diff --git a/learning/runners/finetune_runner.py b/learning/runners/finetune_runner.py index fdff23b4..69b9869f 100644 --- a/learning/runners/finetune_runner.py +++ b/learning/runners/finetune_runner.py @@ -1,11 +1,25 @@ import torch +from tensordict import TensorDict from learning.algorithms 
import * # noqa: F403 from learning.modules import Actor, SmoothActor, Critic from learning.utils import remove_zero_weighted_rewards +from learning.storage import DictStorage +from .BaseRunner import BaseRunner -class FineTuneRunner: - def __init__(self, train_cfg, data_dict, exploration_scale=1.0, device="cpu"): +sim_storage = DictStorage() + + +class FineTuneRunner(BaseRunner): + def __init__( + self, + env, + train_cfg, + data_dict, + exploration_scale=1.0, + device="cpu", + ): + self.env = env self.parse_train_cfg(train_cfg) self.data_dict = data_dict self.exploration_scale = exploration_scale @@ -52,6 +66,21 @@ def parse_train_cfg(self, train_cfg): self.critic_cfg = train_cfg["critic"] def learn(self): + self.alg.switch_to_train() + + if self.env is not None: + # Simulate 1 env, same number of steps as in data dict + num_steps = self.data_dict.shape[0] + sim_data = self.get_sim_data(num_steps) + # Concatenate data dict wtih sim data + self.data_dict = TensorDict( + { + name: torch.cat((self.data_dict[name], sim_data[name]), dim=1) + for name in self.data_dict.keys() + }, + batch_size=(num_steps, 2), + ) + # Single update on data dict if self.cfg["algorithm_class_name"] == "PPO_IPG": # TODO: How to handle off-policy data @@ -59,6 +88,100 @@ def learn(self): else: self.alg.update(self.data_dict) + def get_sim_data(self, num_steps): + rewards_dict = {} + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + + # * Initialize smooth exploration matrices + if self.actor_cfg["smooth_exploration"]: + self.alg.actor.sample_weights(batch_size=1) + + # * Start up storage + transition = TensorDict({}, batch_size=1, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "next_actor_obs": actor_obs, + "actions": self.alg.act(actor_obs), + "critic_obs": critic_obs, + "next_critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "dones": self.get_timed_out(), + } + ) + sim_storage.initialize( + transition, + num_envs=1, + max_storage=num_steps, + device=self.device, + ) + + # * Rollout + with torch.inference_mode(): + for i in range(num_steps): + # * Re-sample noise matrix for smooth exploration + sample_freq = self.actor_cfg["exploration_sample_freq"] + if self.actor_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor.sample_weights(batch_size=1) + + actions = self.alg.act(actor_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + + # * get time_outs + timed_out = self.get_timed_out() + terminated = self.get_terminated() + dones = timed_out | terminated + + self.update_rewards(rewards_dict, terminated) + total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + + transition.update( + { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, + "rewards": total_rewards, + "timed_out": timed_out, + "dones": dones, + } + ) + sim_storage.add_transitions(transition) + + return sim_storage.data + + def update_rewards(self, rewards_dict, terminated): + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], mask=terminated + ) + ) + rewards_dict.update( + self.get_rewards( + 
self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~terminated, + ) + ) + def save(self, path): if self.cfg["algorithm_class_name"] == "PPO_IPG": torch.save( diff --git a/scripts/finetune.py b/scripts/finetune.py index ab97b5bd..42a33739 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -16,21 +16,20 @@ from tensordict import TensorDict ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" -LOAD_RUN = "Jul13_01-49-59_PPO32_S16" -# LOAD_RUN = "Jul12_15-53-57_IPG32_S16" -LOG_FILE = "PPO_700_expl08.mat" -ITERATION = 700 # load this iteration +USE_SIMULATOR = False # IPG_BUFFER = [ # "IPG_700_expl08.mat", # "IPG_701_expl08.mat", # ] -LOG_REWARDS = False # whether to log to dataframe +LOG_REWARDS = False REWARDS_FILE = "rewards_test.csv" -TRAJ_LENGTH = 100 # split data into chunks of this length -EXPLORATION_SCALE = 0.8 # scale std in SmoothActor +# Scales +EXPLORATION_SCALE = 0.8 # used during data collection +ACTION_SCALES = np.tile(np.array([0.2, 0.3, 0.3]), 4) +COMMAND_SCALES = np.array([3.0, 1.0, 3.0]) # Data struct fields from Robot-Software logs DATA_LIST = [ @@ -50,20 +49,7 @@ "footer", ] -# Finetune for these rewards instead of the ones in the cfg -REWARD_WEIGHTS = { - "min_base_height": 1.5, - # "tracking_lin_vel": 4.0, - # "tracking_ang_vel": 2.0, - "orientation": 1.0, - "swing_grf": 1.0, - "stance_grf": 1.0, - "stand_still": 1.0, - "action_rate": 0.001, - "action_rate2": 0.001, -} - -DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +DEVICE = "cuda" def get_obs(obs_list, data_struct, batch_size): @@ -77,18 +63,18 @@ def get_obs(obs_list, data_struct, batch_size): return obs_all.float() -def get_rewards(data_struct, batch_size): - mini_cheetah = MinimalistCheetah(device=DEVICE) - rewards_dict = {name: [] for name in REWARD_WEIGHTS.keys()} # for plotting +def get_rewards(data_struct, reward_weights, batch_size): + minimalist_cheetah = MinimalistCheetah(device=DEVICE) + rewards_dict = {name: [] for name in reward_weights.keys()} # for plotting rewards_all = torch.empty(0).to(DEVICE) for i in range(batch_size): - mini_cheetah.set_states( + minimalist_cheetah.set_states( base_height=data_struct[1][:, i], # shape (1, batch_size) base_lin_vel=data_struct[2][i], base_ang_vel=data_struct[3][i], proj_gravity=data_struct[4][i], - commands=data_struct[5][i], + commands=data_struct[5][i] * COMMAND_SCALES, dof_pos_obs=data_struct[6][i], dof_vel=data_struct[7][i], phase_obs=data_struct[8][i], @@ -96,43 +82,47 @@ def get_rewards(data_struct, batch_size): dof_pos_target=data_struct[10][i], ) total_rewards = 0 - for name, weight in REWARD_WEIGHTS.items(): - reward = weight * eval(f"mini_cheetah._reward_{name}()") + for name, weight in reward_weights.items(): + reward = weight * eval(f"minimalist_cheetah._reward_{name}()") rewards_dict[name].append(reward.item()) total_rewards += reward rewards_all = torch.cat((rewards_all, total_rewards), dim=0) # Post process mini cheetah - mini_cheetah.post_process() + minimalist_cheetah.post_process() rewards_dict["total"] = rewards_all.tolist() - if LOG_REWARDS: - rewards_path = os.path.join(ROOT_DIR, LOAD_RUN, REWARDS_FILE) - if not os.path.exists(rewards_path): - rewards_df = pd.DataFrame(columns=["iteration", "type", "mean", "std"]) - else: - rewards_df = pd.read_csv(rewards_path) - for name, rewards in rewards_dict.items(): - rewards = np.array(rewards) - mean = rewards.mean() - std = rewards.std() - rewards_df = rewards_df._append( - { - "iteration": str(ITERATION) + "_expl", - "type": name, - "mean": 
mean, - "std": std, - }, - ignore_index=True, - ) - rewards_df.to_csv(rewards_path, index=False) - print(rewards_df) - - return rewards_all.float() + return rewards_all.float(), rewards_dict + + +def log_rewards(rewards_dict, path, checkpoint): + if not os.path.exists(path): + rewards_df = pd.DataFrame(columns=["checkpoint", "type", "mean", "std"]) + else: + rewards_df = pd.read_csv(path) + for name, rewards in rewards_dict.items(): + rewards = np.array(rewards) + mean = rewards.mean() + std = rewards.std() + rewards_df = rewards_df._append( + { + "checkpoint": str(checkpoint), + "type": name, + "mean": mean, + "std": std, + }, + ignore_index=True, + ) + rewards_df.to_csv(path, index=False) + print(rewards_df) def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER"): - path = os.path.join(ROOT_DIR, LOAD_RUN, LOG_FILE) + load_run = train_cfg["runner"]["load_run"] + checkpoint = train_cfg["runner"]["checkpoint"] + log_file = str(checkpoint) + ".mat" + + path = os.path.join(ROOT_DIR, load_run, log_file) data = scipy.io.loadmat(path) data_struct = data[name][0][0] batch_size = data_struct[1].shape[1] # base_height: shape (1, batch_size) @@ -154,13 +144,17 @@ def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER"): actions = actions.reshape((batch_size, 1, -1)) # shape (batch_size, 1, n) data_dict["actions"] = actions[:-1] - rewards = get_rewards(data_struct, batch_size) + reward_weights = train_cfg["critic"]["reward"]["weights"] + rewards, rewards_dict = get_rewards(data_struct, reward_weights, batch_size) rewards = rewards.reshape((batch_size, 1)) # shape (batch_size, 1) data_dict["rewards"] = rewards[:-1] - # Assume no termination + if LOG_REWARDS: + rewards_path = os.path.join(ROOT_DIR, load_run, REWARDS_FILE) + log_rewards(rewards_dict, rewards_path, checkpoint) + + # No time outs and terminations data_dict["timed_out"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) - data_dict["terminal"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) data_dict["dones"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) return data_dict @@ -169,11 +163,19 @@ def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER"): def setup(): args = get_args() - _, train_cfg = task_registry.create_cfgs(args) + env_cfg, train_cfg = task_registry.create_cfgs(args) + task_registry.make_gym_and_sim() + + if USE_SIMULATOR: + env = task_registry.make_env(name=args.task, env_cfg=env_cfg) + else: + env = None + train_cfg = class_to_dict(train_cfg) data_dict = get_data_dict(train_cfg) runner = FineTuneRunner( + env, train_cfg, data_dict, exploration_scale=EXPLORATION_SCALE, @@ -185,7 +187,9 @@ def setup(): def finetune(runner): - load_path = os.path.join(ROOT_DIR, LOAD_RUN, "model_" + str(ITERATION) + ".pt") + load_run = runner.cfg["load_run"] + checkpoint = runner.cfg["checkpoint"] + load_path = os.path.join(ROOT_DIR, load_run, "model_" + str(checkpoint) + ".pt") print("Loading model from: ", load_path) runner.load(load_path) @@ -193,18 +197,19 @@ def finetune(runner): runner.learn() # Compare old and new actions - scaling = torch.tensor([0.2, 0.3, 0.3], device=DEVICE).repeat(4) actions_old = runner.data_dict["actions"] - actions_new = scaling * runner.alg.actor.act_inference( + action_scales = torch.tensor(ACTION_SCALES).to(DEVICE) + actions_new = action_scales * runner.alg.actor.act_inference( runner.data_dict["actor_obs"] ) diff = actions_new - actions_old print("Mean action diff per actuator: ", diff.mean(dim=(0, 1))) + print("Overall mean action diff: ", diff.mean()) - save_path = 
os.path.join(ROOT_DIR, LOAD_RUN, "model_" + str(ITERATION + 1) + ".pt") + save_path = os.path.join(ROOT_DIR, load_run, "model_" + str(checkpoint + 1) + ".pt") runner.save(save_path) - export_path = os.path.join(ROOT_DIR, LOAD_RUN, "exported_" + str(ITERATION + 1)) + export_path = os.path.join(ROOT_DIR, load_run, "exported_" + str(checkpoint + 1)) runner.export(export_path) From b01b0a3b56e8d99dd8a8369f8b0096364331069a Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Tue, 16 Jul 2024 18:48:14 -0400 Subject: [PATCH 32/42] script to finetune multiple runs with robot-software --- scripts/finetune_multiple.sh | 49 ++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 scripts/finetune_multiple.sh diff --git a/scripts/finetune_multiple.sh b/scripts/finetune_multiple.sh new file mode 100644 index 00000000..81b26056 --- /dev/null +++ b/scripts/finetune_multiple.sh @@ -0,0 +1,49 @@ +#!/bin/bash +source /home/lmolnar/miniconda3/etc/profile.d/conda.sh + +# Args +LOAD_RUN="Jul13_01-49-59_PPO32_S16" +CHECKPOINT=700 +N_RUNS=5 + +# Set directories +QGYM_DIR="/home/lmolnar/workspace/QGym" +QGYM_LOG_DIR="${QGYM_DIR}/logs/mini_cheetah_ref/${LOAD_RUN}" + +RS_DIR="/home/lmolnar/workspace/Robot-Software" +RS_POLICY_DIR="${RS_DIR}/config/systems/quadruped/controllers/policies" + +# Change name of export folder +mv ${QGYM_LOG_DIR}/exported ${QGYM_LOG_DIR}/exported_${CHECKPOINT} + +for i in {1..N_RUNS} +do + # Copy exported files to RS + cp ${QGYM_LOG_DIR}/exported_${CHECKPOINT}/* ${RS_POLICY_DIR} + + # Store LCM logs in file labeled with checkpoint + FILE=${RS_DIR}/logging/lcm_logs/${CHECKPOINT} + if [ -f "$FILE" ]; then + rm "$FILE" + fi + + # Run logging script in background + ${RS_DIR}/logging/scripts/run_lcm_logger.sh ${CHECKPOINT} & + PID1=$! 
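+    # $! holds the PID of the logger we just sent to the background; it is
+    # killed below once the quadruped run finishes, so logging stops with the run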
+ + # Run quadruped script and cancel logging script when done + ${RS_DIR}/build/bin/run_quadruped m s + kill $PID1 + + # Convert logs to .mat and copy to QGym + conda deactivate + conda activate robot-sw + ${RS_DIR}/logging/scripts/sim_data_convert.sh -l + cp ${RS_DIR}/logging/matlab_logs/${CHECKPOINT}.mat ${QGYM_LOG_DIR} + + # Finetune in QGym + conda deactivate + conda activate qgym + python ${QGYM_DIR}/scripts/finetune.py --task=mini_cheetah_finetune --headless --load_run=${LOAD_RUN} --checkpoint=${CHECKPOINT} + CHECKPOINT=$((CHECKPOINT + 1)) +done \ No newline at end of file From 77f5bd48b728b16ca368466819f3829f4bd0a5b0 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Tue, 16 Jul 2024 19:43:17 -0400 Subject: [PATCH 33/42] bugfix finetune script, update rewards evaluation --- scripts/eval_rewards.py | 44 ++++++++++++++++++++---------------- scripts/finetune_multiple.sh | 18 +++++++-------- 2 files changed, 33 insertions(+), 29 deletions(-) diff --git a/scripts/eval_rewards.py b/scripts/eval_rewards.py index 15b287f3..1e4d6c50 100644 --- a/scripts/eval_rewards.py +++ b/scripts/eval_rewards.py @@ -11,10 +11,8 @@ ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" LOAD_RUN = "Jul13_01-49-59_PPO32_S16" -LOG_FILE = "PPO_701_a001.mat" -REWARDS_FILE = "rewards_a001.csv" - -ITERATION = 701 # load this iteration +REWARDS_FILE = "rewards.csv" +ITERATIONS = range(700, 705) PLOT = True PLOT_N = 1000 # number of steps to plot @@ -37,7 +35,6 @@ "footer", ] -# Finetune for these rewards instead of the ones in the cfg REWARD_WEIGHTS = { "min_base_height": 1.5, # "tracking_lin_vel": 4.0, @@ -50,10 +47,10 @@ "action_rate2": 0.001, } -DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +DEVICE = "cuda" -def get_rewards(data_struct): +def get_rewards(it, data_struct, rewards_path): data_length = data_struct[1].shape[1] # base_height: shape (1, data_length) mini_cheetah = MinimalistCheetah(device=DEVICE) @@ -85,7 +82,6 @@ def get_rewards(data_struct): rewards_dict["total"] = rewards_all.tolist() # Save rewards in dataframe - rewards_path = os.path.join(ROOT_DIR, LOAD_RUN, REWARDS_FILE) if not os.path.exists(rewards_path): rewards_df = pd.DataFrame(columns=["iteration", "type", "mean", "std"]) else: @@ -96,7 +92,7 @@ def get_rewards(data_struct): std = rewards.std() rewards_df = rewards_df._append( { - "iteration": ITERATION, + "iteration": it, "type": name, "mean": mean, "std": std, @@ -104,23 +100,31 @@ def get_rewards(data_struct): ignore_index=True, ) if PLOT: + plt.figure(it) plt.plot(rewards[:PLOT_N], label=name) + plt.title(f"Rewards Iteration {it}") + plt.legend() + plt.savefig(f"{ROOT_DIR}/{LOAD_RUN}/rewards_{it}.png") rewards_df.to_csv(rewards_path, index=False) - print(rewards_df) def setup(name="SMOOTH_RL_CONTROLLER"): - path = os.path.join(ROOT_DIR, LOAD_RUN, LOG_FILE) - data = scipy.io.loadmat(path) - data_struct = data[name][0][0] - return data_struct + data_dict = {} + for it in ITERATIONS: + path = os.path.join(ROOT_DIR, LOAD_RUN, f"{it}.mat") + data = scipy.io.loadmat(path) + data_dict[it] = data[name][0][0] + return data_dict if __name__ == "__main__": - data_struct = setup() - get_rewards(data_struct) + data_dict = setup() + rewards_path = os.path.join(ROOT_DIR, LOAD_RUN, REWARDS_FILE) + if os.path.exists(rewards_path): + os.remove(rewards_path) + + for it, data_struct in data_dict.items(): + get_rewards(it, data_struct, rewards_path) - if PLOT: - plt.title("Rewards") - plt.legend() - plt.show() + rewards_df = pd.read_csv(rewards_path) + print(rewards_df) diff 
--git a/scripts/finetune_multiple.sh b/scripts/finetune_multiple.sh index 81b26056..9a214044 100644 --- a/scripts/finetune_multiple.sh +++ b/scripts/finetune_multiple.sh @@ -8,19 +8,16 @@ N_RUNS=5 # Set directories QGYM_DIR="/home/lmolnar/workspace/QGym" -QGYM_LOG_DIR="${QGYM_DIR}/logs/mini_cheetah_ref/${LOAD_RUN}" - RS_DIR="/home/lmolnar/workspace/Robot-Software" -RS_POLICY_DIR="${RS_DIR}/config/systems/quadruped/controllers/policies" -# Change name of export folder +QGYM_LOG_DIR="${QGYM_DIR}/logs/mini_cheetah_ref/${LOAD_RUN}" + +# Change default folder name and copy to RS mv ${QGYM_LOG_DIR}/exported ${QGYM_LOG_DIR}/exported_${CHECKPOINT} +cp ${QGYM_LOG_DIR}/exported_${CHECKPOINT}/* ${RS_DIR}/config/systems/quadruped/controllers/policies -for i in {1..N_RUNS} +for i in $(seq 1 $N_RUNS) do - # Copy exported files to RS - cp ${QGYM_LOG_DIR}/exported_${CHECKPOINT}/* ${RS_POLICY_DIR} - # Store LCM logs in file labeled with checkpoint FILE=${RS_DIR}/logging/lcm_logs/${CHECKPOINT} if [ -f "$FILE" ]; then @@ -38,12 +35,15 @@ do # Convert logs to .mat and copy to QGym conda deactivate conda activate robot-sw - ${RS_DIR}/logging/scripts/sim_data_convert.sh -l + ${RS_DIR}/logging/scripts/sim_data_convert.sh ${CHECKPOINT} cp ${RS_DIR}/logging/matlab_logs/${CHECKPOINT}.mat ${QGYM_LOG_DIR} # Finetune in QGym conda deactivate conda activate qgym python ${QGYM_DIR}/scripts/finetune.py --task=mini_cheetah_finetune --headless --load_run=${LOAD_RUN} --checkpoint=${CHECKPOINT} + + # Copy policy to RS CHECKPOINT=$((CHECKPOINT + 1)) + cp ${QGYM_LOG_DIR}/exported_${CHECKPOINT}/* ${RS_DIR}/config/systems/quadruped/controllers/policies done \ No newline at end of file From 68fde3ea0751ab0d34b5c1e57d6edae753c01e2c Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 17 Jul 2024 10:38:13 -0400 Subject: [PATCH 34/42] build off-policy buffer for IPG from RS data --- .../mini_cheetah_finetune_config.py | 8 +- learning/runners/finetune_runner.py | 31 ++--- scripts/finetune.py | 120 +++++++++++------- scripts/finetune_multiple.sh | 2 +- 4 files changed, 92 insertions(+), 69 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py b/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py index e07ee45a..284fcd0a 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py @@ -57,7 +57,7 @@ class scaling(MiniCheetahRefCfg.scaling): class MiniCheetahFineTuneRunnerCfg(MiniCheetahRefRunnerCfg): seed = -1 - runner_class_name = "OnPolicyRunner" + runner_class_name = "IPGRunner" class actor: hidden_dims = [256, 256, 128] @@ -132,18 +132,18 @@ class algorithm(MiniCheetahRefRunnerCfg.algorithm): storage_size = 4 * 32 * 4096 # num_policies*num_stpes*num_envs # Finetuning - learning_rate = 3e-4 + learning_rate = 1e-4 max_gradient_steps = 5 class runner(MiniCheetahRefRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" max_iterations = 20 # number of policy updates - algorithm_class_name = "PPO2" + algorithm_class_name = "PPO_IPG" num_steps_per_env = 3000 # Finetuning resume = True - load_run = "Jul13_01-49-59_PPO32_S16" + load_run = "Jul12_15-53-57_IPG32_S16" checkpoint = 700 save_interval = 5 diff --git a/learning/runners/finetune_runner.py b/learning/runners/finetune_runner.py index 69b9869f..064073b3 100644 --- a/learning/runners/finetune_runner.py +++ b/learning/runners/finetune_runner.py @@ -15,20 +15,22 @@ def __init__( self, env, train_cfg, - data_dict, + data_onpol, + data_offpol=None, exploration_scale=1.0, 
device="cpu", ): self.env = env self.parse_train_cfg(train_cfg) - self.data_dict = data_dict + self.data_onpol = data_onpol + self.data_offpol = data_offpol self.exploration_scale = exploration_scale self.device = device def _set_up_alg(self): - num_actor_obs = self.data_dict["actor_obs"].shape[-1] - num_actions = self.data_dict["actions"].shape[-1] - num_critic_obs = self.data_dict["critic_obs"].shape[-1] + num_actor_obs = self.data_onpol["actor_obs"].shape[-1] + num_actions = self.data_onpol["actions"].shape[-1] + num_critic_obs = self.data_onpol["critic_obs"].shape[-1] if self.actor_cfg["smooth_exploration"]: actor = SmoothActor( num_obs=num_actor_obs, @@ -69,24 +71,23 @@ def learn(self): self.alg.switch_to_train() if self.env is not None: - # Simulate 1 env, same number of steps as in data dict - num_steps = self.data_dict.shape[0] + # Simulate 1 env, same number of steps as in data + num_steps = self.data_onpol.shape[0] sim_data = self.get_sim_data(num_steps) # Concatenate data dict wtih sim data - self.data_dict = TensorDict( + self.data_onpol = TensorDict( { - name: torch.cat((self.data_dict[name], sim_data[name]), dim=1) - for name in self.data_dict.keys() + name: torch.cat((self.data_onpol[name], sim_data[name]), dim=1) + for name in self.data_onpol.keys() }, batch_size=(num_steps, 2), ) - # Single update on data dict - if self.cfg["algorithm_class_name"] == "PPO_IPG": - # TODO: How to handle off-policy data - self.alg.update(self.data_dict, self.data_dict) + # Single alg update on data + if self.data_offpol is None: + self.alg.update(self.data_onpol) else: - self.alg.update(self.data_dict) + self.alg.update(self.data_onpol, self.data_offpol) def get_sim_data(self, num_steps): rewards_dict = {} diff --git a/scripts/finetune.py b/scripts/finetune.py index 42a33739..9bdc7add 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -18,11 +18,6 @@ ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" USE_SIMULATOR = False -# IPG_BUFFER = [ -# "IPG_700_expl08.mat", -# "IPG_701_expl08.mat", -# ] - LOG_REWARDS = False REWARDS_FILE = "rewards_test.csv" @@ -52,25 +47,25 @@ DEVICE = "cuda" -def get_obs(obs_list, data_struct, batch_size): +def get_obs(obs_list, data_struct, data_length): obs_all = torch.empty(0).to(DEVICE) for obs_name in obs_list: data_idx = DATA_LIST.index(obs_name) obs = torch.tensor(data_struct[data_idx]).to(DEVICE) - obs = obs.reshape((batch_size, 1, -1)) # shape (batch_size, 1, n) + obs = obs.reshape((data_length, 1, -1)) # shape (data_length, 1, n) obs_all = torch.cat((obs_all, obs), dim=-1) return obs_all.float() -def get_rewards(data_struct, reward_weights, batch_size): +def get_rewards(data_struct, reward_weights, data_length): minimalist_cheetah = MinimalistCheetah(device=DEVICE) rewards_dict = {name: [] for name in reward_weights.keys()} # for plotting rewards_all = torch.empty(0).to(DEVICE) - for i in range(batch_size): + for i in range(data_length): minimalist_cheetah.set_states( - base_height=data_struct[1][:, i], # shape (1, batch_size) + base_height=data_struct[1][:, i], # shape (1, data_length) base_lin_vel=data_struct[2][i], base_ang_vel=data_struct[3][i], proj_gravity=data_struct[4][i], @@ -117,45 +112,63 @@ def log_rewards(rewards_dict, path, checkpoint): print(rewards_df) -def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER"): +def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER", offpol=False): load_run = train_cfg["runner"]["load_run"] checkpoint = train_cfg["runner"]["checkpoint"] - log_file = str(checkpoint) + ".mat" - - path = 
os.path.join(ROOT_DIR, load_run, log_file) - data = scipy.io.loadmat(path) - data_struct = data[name][0][0] - batch_size = data_struct[1].shape[1] # base_height: shape (1, batch_size) - data_dict = TensorDict( - {}, - device=DEVICE, - batch_size=(batch_size - 1, 1), # -1 for next_obs - ) - actor_obs = get_obs(train_cfg["actor"]["obs"], data_struct, batch_size) - critic_obs = get_obs(train_cfg["critic"]["obs"], data_struct, batch_size) - - data_dict["actor_obs"] = actor_obs[:-1] - data_dict["next_actor_obs"] = actor_obs[1:] - data_dict["critic_obs"] = critic_obs[:-1] - data_dict["next_critic_obs"] = critic_obs[1:] - - actions_idx = DATA_LIST.index("dof_pos_target") - actions = torch.tensor(data_struct[actions_idx]).to(DEVICE).float() - actions = actions.reshape((batch_size, 1, -1)) # shape (batch_size, 1, n) - data_dict["actions"] = actions[:-1] + run_dir = os.path.join(ROOT_DIR, load_run) - reward_weights = train_cfg["critic"]["reward"]["weights"] - rewards, rewards_dict = get_rewards(data_struct, reward_weights, batch_size) - rewards = rewards.reshape((batch_size, 1)) # shape (batch_size, 1) - data_dict["rewards"] = rewards[:-1] - - if LOG_REWARDS: - rewards_path = os.path.join(ROOT_DIR, load_run, REWARDS_FILE) - log_rewards(rewards_dict, rewards_path, checkpoint) + if offpol: + log_files = [file for file in os.listdir(run_dir) if file.endswith(".mat")] + log_files = sorted(log_files) + else: + # Single log file + log_files = [str(checkpoint) + ".mat"] - # No time outs and terminations - data_dict["timed_out"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) - data_dict["dones"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) + # Initialize data dict + data = scipy.io.loadmat(os.path.join(run_dir, log_files[0])) + data_struct = data[name][0][0] + data_length = data_struct[1].shape[1] # base_height: shape (1, data_length) + batch_size = (data_length - 1, len(log_files)) # -1 for next_obs + data_dict = TensorDict({}, device=DEVICE, batch_size=batch_size) + + # Get all data + actor_obs_all = torch.empty(0).to(DEVICE) + critic_obs_all = torch.empty(0).to(DEVICE) + actions_all = torch.empty(0).to(DEVICE) + rewards_all = torch.empty(0).to(DEVICE) + for log in log_files: + data = scipy.io.loadmat(os.path.join(run_dir, log)) + data_struct = data[name][0][0] + + actor_obs = get_obs(train_cfg["actor"]["obs"], data_struct, data_length) + critic_obs = get_obs(train_cfg["critic"]["obs"], data_struct, data_length) + actor_obs_all = torch.cat((actor_obs_all, actor_obs), dim=1) + critic_obs_all = torch.cat((critic_obs_all, critic_obs), dim=1) + + actions_idx = DATA_LIST.index("dof_pos_target") + actions = torch.tensor(data_struct[actions_idx]).to(DEVICE).float() + actions = actions.reshape((data_length, 1, -1)) # shape (data_length, 1, n) + actions_all = torch.cat((actions_all, actions), dim=1) + + reward_weights = train_cfg["critic"]["reward"]["weights"] + rewards, rewards_dict = get_rewards(data_struct, reward_weights, data_length) + rewards = rewards.reshape((data_length, 1)) # shape (data_length, 1) + rewards_all = torch.cat((rewards_all, rewards), dim=1) + + if LOG_REWARDS: + rewards_path = os.path.join(ROOT_DIR, load_run, REWARDS_FILE) + log_rewards(rewards_dict, rewards_path, checkpoint) + + data_dict["actor_obs"] = actor_obs_all[:-1] + data_dict["next_actor_obs"] = actor_obs_all[1:] + data_dict["critic_obs"] = critic_obs_all[:-1] + data_dict["next_critic_obs"] = critic_obs_all[1:] + data_dict["actions"] = actions_all[:-1] + data_dict["rewards"] = rewards_all[:-1] + + # No 
time outs and dones + data_dict["timed_out"] = torch.zeros(batch_size, device=DEVICE, dtype=bool) + data_dict["dones"] = torch.zeros(batch_size, device=DEVICE, dtype=bool) return data_dict @@ -172,12 +185,21 @@ def setup(): env = None train_cfg = class_to_dict(train_cfg) - data_dict = get_data_dict(train_cfg) + data_onpol = get_data_dict(train_cfg, offpol=False) + + if train_cfg["runner"]["algorithm_class_name"] == "PPO_IPG": + data_offpol = get_data_dict(train_cfg, offpol=True) + else: + data_offpol = None + + print(data_onpol) + print(data_offpol) runner = FineTuneRunner( env, train_cfg, - data_dict, + data_onpol, + data_offpol, exploration_scale=EXPLORATION_SCALE, device=DEVICE, ) @@ -197,10 +219,10 @@ def finetune(runner): runner.learn() # Compare old and new actions - actions_old = runner.data_dict["actions"] + actions_old = runner.data_onpol["actions"] action_scales = torch.tensor(ACTION_SCALES).to(DEVICE) actions_new = action_scales * runner.alg.actor.act_inference( - runner.data_dict["actor_obs"] + runner.data_onpol["actor_obs"] ) diff = actions_new - actions_old print("Mean action diff per actuator: ", diff.mean(dim=(0, 1))) diff --git a/scripts/finetune_multiple.sh b/scripts/finetune_multiple.sh index 9a214044..fa81966f 100644 --- a/scripts/finetune_multiple.sh +++ b/scripts/finetune_multiple.sh @@ -2,7 +2,7 @@ source /home/lmolnar/miniconda3/etc/profile.d/conda.sh # Args -LOAD_RUN="Jul13_01-49-59_PPO32_S16" +LOAD_RUN="Jul12_15-53-57_IPG32_S16" CHECKPOINT=700 N_RUNS=5 From 5e56d54fb543b3feb9da3889b25e1ce445b71207 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 17 Jul 2024 11:24:31 -0400 Subject: [PATCH 35/42] remove rewards logging in finetune.py --- scripts/finetune.py | 37 ++----------------------------------- 1 file changed, 2 insertions(+), 35 deletions(-) diff --git a/scripts/finetune.py b/scripts/finetune.py index 9bdc7add..51f165cf 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -12,14 +12,11 @@ import torch import scipy.io import numpy as np -import pandas as pd from tensordict import TensorDict ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" USE_SIMULATOR = False -LOG_REWARDS = False -REWARDS_FILE = "rewards_test.csv" # Scales EXPLORATION_SCALE = 0.8 # used during data collection @@ -60,7 +57,6 @@ def get_obs(obs_list, data_struct, data_length): def get_rewards(data_struct, reward_weights, data_length): minimalist_cheetah = MinimalistCheetah(device=DEVICE) - rewards_dict = {name: [] for name in reward_weights.keys()} # for plotting rewards_all = torch.empty(0).to(DEVICE) for i in range(data_length): @@ -79,37 +75,12 @@ def get_rewards(data_struct, reward_weights, data_length): total_rewards = 0 for name, weight in reward_weights.items(): reward = weight * eval(f"minimalist_cheetah._reward_{name}()") - rewards_dict[name].append(reward.item()) total_rewards += reward rewards_all = torch.cat((rewards_all, total_rewards), dim=0) # Post process mini cheetah minimalist_cheetah.post_process() - rewards_dict["total"] = rewards_all.tolist() - - return rewards_all.float(), rewards_dict - - -def log_rewards(rewards_dict, path, checkpoint): - if not os.path.exists(path): - rewards_df = pd.DataFrame(columns=["checkpoint", "type", "mean", "std"]) - else: - rewards_df = pd.read_csv(path) - for name, rewards in rewards_dict.items(): - rewards = np.array(rewards) - mean = rewards.mean() - std = rewards.std() - rewards_df = rewards_df._append( - { - "checkpoint": str(checkpoint), - "type": name, - "mean": mean, - "std": std, - }, - ignore_index=True, - ) - 
rewards_df.to_csv(path, index=False) - print(rewards_df) + return rewards_all.float() def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER", offpol=False): @@ -151,14 +122,10 @@ def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER", offpol=False): actions_all = torch.cat((actions_all, actions), dim=1) reward_weights = train_cfg["critic"]["reward"]["weights"] - rewards, rewards_dict = get_rewards(data_struct, reward_weights, data_length) + rewards = get_rewards(data_struct, reward_weights, data_length) rewards = rewards.reshape((data_length, 1)) # shape (data_length, 1) rewards_all = torch.cat((rewards_all, rewards), dim=1) - if LOG_REWARDS: - rewards_path = os.path.join(ROOT_DIR, load_run, REWARDS_FILE) - log_rewards(rewards_dict, rewards_path, checkpoint) - data_dict["actor_obs"] = actor_obs_all[:-1] data_dict["next_actor_obs"] = actor_obs_all[1:] data_dict["critic_obs"] = critic_obs_all[:-1] From 1c70ce564c6a4e2d84a34b374ce8b8f610a79664 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 17 Jul 2024 18:31:12 -0400 Subject: [PATCH 36/42] merge lm/SAC_plotting branch, visualize PPO and IPG critics --- gym/envs/base/fixed_robot.py | 97 ++++++++---------- gym/envs/base/fixed_robot_config.py | 52 +++++----- gym/envs/cartpole/cartpole_config.py | 2 +- gym/envs/pendulum/pendulum.py | 78 +++++++++++---- gym/envs/pendulum/pendulum_config.py | 77 +++++++++------ learning/modules/lqrc/plotting.py | 142 +++++++++++++++++++++++++++ learning/runners/ipg_runner.py | 26 +++-- learning/runners/on_policy_runner.py | 23 +++-- scripts/visualize_ipg.py | 124 +++++++++++++++++++++++ scripts/visualize_ppo.py | 102 +++++++++++++++++++ 10 files changed, 575 insertions(+), 148 deletions(-) create mode 100644 learning/modules/lqrc/plotting.py create mode 100644 scripts/visualize_ipg.py create mode 100644 scripts/visualize_ppo.py diff --git a/gym/envs/base/fixed_robot.py b/gym/envs/base/fixed_robot.py index 1fe4c5d5..6d9dc971 100644 --- a/gym/envs/base/fixed_robot.py +++ b/gym/envs/base/fixed_robot.py @@ -40,64 +40,60 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): self.reset() def step(self): - """Apply actions, simulate, call self.post_physics_step() - and pre_physics_step() - - Args: - actions (torch.Tensor): Tensor of shape - (num_envs, num_actions_per_env) - """ - self._reset_buffers() - self._pre_physics_step() - # * step physics and render each frame + self._pre_decimation_step() self._render() for _ in range(self.cfg.control.decimation): + self._pre_compute_torques() self.torques = self._compute_torques() - - if self.cfg.asset.disable_motors: - self.torques[:] = 0.0 - torques_to_gym_tensor = torch.zeros( - self.num_envs, self.num_dof, device=self.device - ) - - # todo encapsulate - next_torques_idx = 0 - for dof_idx in range(self.num_dof): - if self.cfg.control.actuated_joints_mask[dof_idx]: - torques_to_gym_tensor[:, dof_idx] = self.torques[ - :, next_torques_idx - ] - next_torques_idx += 1 - else: - torques_to_gym_tensor[:, dof_idx] = torch.zeros( - self.num_envs, device=self.device - ) - - self.gym.set_dof_actuation_force_tensor( - self.sim, gymtorch.unwrap_tensor(torques_to_gym_tensor) - ) - self.gym.simulate(self.sim) - if self.device == "cpu": - self.gym.fetch_results(self.sim, True) - self.gym.refresh_dof_state_tensor(self.sim) - - self._post_physics_step() + self._post_compute_torques() + self._step_physx_sim() + self._post_physx_step() + self._post_decimation_step() self._check_terminations_and_timeouts() env_ids = self.to_be_reset.nonzero(as_tuple=False).flatten() 
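        # * re-initialize any envs flagged by the termination/timeout check above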
self._reset_idx(env_ids) - def _pre_physics_step(self): - pass + def _pre_decimation_step(self): + return None + + def _pre_compute_torques(self): + return None + + def _post_compute_torques(self): + if self.cfg.asset.disable_motors: + self.torques[:] = 0.0 - def _post_physics_step(self): + def _step_physx_sim(self): + next_torques_idx = 0 + torques_to_gym_tensor = torch.zeros( + self.num_envs, self.num_dof, device=self.device + ) + for dof_idx in range(self.num_dof): + if self.cfg.control.actuated_joints_mask[dof_idx]: + torques_to_gym_tensor[:, dof_idx] = self.torques[:, next_torques_idx] + next_torques_idx += 1 + else: + torques_to_gym_tensor[:, dof_idx] = torch.zeros( + self.num_envs, device=self.device + ) + self.gym.set_dof_actuation_force_tensor( + self.sim, gymtorch.unwrap_tensor(self.torques) + ) + self.gym.simulate(self.sim) + if self.device == "cpu": + self.gym.fetch_results(self.sim, True) + self.gym.refresh_dof_state_tensor(self.sim) + + def _post_physx_step(self): """ check terminations, compute observations and rewards """ self.gym.refresh_actor_root_state_tensor(self.sim) self.gym.refresh_net_contact_force_tensor(self.sim) + def _post_decimation_step(self): self.episode_length_buf += 1 self.common_step_counter += 1 @@ -212,18 +208,6 @@ def _process_rigid_body_props(self, props, env_id): return props def _compute_torques(self): - """Compute torques from actions. - Actions can be interpreted as position or velocity targets given - to a PD controller, or directly as scaled torques. - [NOTE]: torques must have the same dimension as the number of DOFs, - even if some DOFs are not actuated. - - Args: - actions (torch.Tensor): Actions - - Returns: - [torch.Tensor]: Torques sent to the simulation - """ actuated_dof_pos = torch.zeros( self.num_envs, self.num_actuators, device=self.device ) @@ -415,10 +399,7 @@ def _init_buffers(self): self.default_act_pos = self.default_act_pos.unsqueeze(0) # * store indices of actuated joints self.act_idx = to_torch(actuated_idx, dtype=torch.long, device=self.device) - # * check that init range highs and lows are consistent - # * and repopulate to match - if self.cfg.init_state.reset_mode == "reset_to_range": - self.initialize_ranges_for_initial_conditions() + self.initialize_ranges_for_initial_conditions() def initialize_ranges_for_initial_conditions(self): self.dof_pos_range = torch.zeros( diff --git a/gym/envs/base/fixed_robot_config.py b/gym/envs/base/fixed_robot_config.py index 8c7427f8..a9bdef47 100644 --- a/gym/envs/base/fixed_robot_config.py +++ b/gym/envs/base/fixed_robot_config.py @@ -123,34 +123,33 @@ class FixedRobotCfgPPO(BaseConfig): class logging: enable_local_saving = True - class policy: + class actor: init_noise_std = 1.0 hidden_dims = [512, 256, 128] - critic_hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - # only for 'ActorCriticRecurrent': - # rnn_type = 'lstm' - # rnn_hidden_size = 512 - # rnn_num_layers = 1 - obs = [ "observation_a", "observation_b", "these_need_to_be_atributes_(states)_of_the_robot_env", ] - - critic_obs = [ - "observation_x", - "observation_y", - "critic_obs_can_be_the_same_or_different_than_actor_obs", - ] + normalize_obs = True actions = ["tau_ff"] disable_actions = False class noise: - noise = 0.1 # implement as needed, also in your robot class + observation_a = 0.1 # implement as needed, also in your robot class + + class critic: + hidden_dims = [512, 256, 128] + activation = "elu" + normalize_obs = True + obs = [ + "observation_x", + 
"observation_y", + "critic_obs_can_be_the_same_or_different_than_actor_obs", + ] class rewards: class weights: @@ -165,20 +164,25 @@ class termination_weight: termination = 0.0 class algorithm: - # * training params - value_loss_coef = 1.0 - use_clipped_value_loss = True + # both + gamma = 0.99 + lam = 0.95 + # shared + batch_size = 2**15 + max_gradient_steps = 10 + # new + storage_size = 2**17 # new + batch_size = 2**15 # new + clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 5 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor + entropy_coef = 0.01 schedule = "adaptive" # could be adaptive, fixed - gamma = 0.99 - lam = 0.95 desired_kl = 0.01 - max_grad_norm = 1.0 class runner: policy_class_name = "ActorCritic" diff --git a/gym/envs/cartpole/cartpole_config.py b/gym/envs/cartpole/cartpole_config.py index f18d5a3b..6e073d5c 100644 --- a/gym/envs/cartpole/cartpole_config.py +++ b/gym/envs/cartpole/cartpole_config.py @@ -67,7 +67,7 @@ class CartpoleRunnerCfg(FixedRobotCfgPPO): seed = -1 runner_class_name = "OnPolicyRunner" - class policy(FixedRobotCfgPPO.policy): + class actor(FixedRobotCfgPPO.actor): init_noise_std = 1.0 num_layers = 2 num_units = 32 diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index 95af1fb2..d0bcfb9c 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -1,15 +1,49 @@ import torch +import numpy as np +from math import sqrt from gym.envs.base.fixed_robot import FixedRobot class Pendulum(FixedRobot): - def _post_physics_step(self): - """Update all states that are not handled in PhysX""" - super()._post_physics_step() + def _init_buffers(self): + super()._init_buffers() + self.dof_pos_obs = torch.zeros(self.num_envs, 2, device=self.device) + + def _post_decimation_step(self): + super()._post_decimation_step() + self.dof_pos_obs = torch.cat([self.dof_pos.sin(), self.dof_pos.cos()], dim=1) + + def _reset_system(self, env_ids): + super()._reset_system(env_ids) + self.dof_pos_obs[env_ids] = torch.cat( + [self.dof_pos[env_ids].sin(), self.dof_pos[env_ids].cos()], dim=1 + ) + + def _check_terminations_and_timeouts(self): + super()._check_terminations_and_timeouts() + self.terminated = self.timed_out + + def reset_to_uniform(self, env_ids): + grid_points = int(sqrt(self.num_envs)) + lin_pos = torch.linspace( + self.dof_pos_range[0, 0], + self.dof_pos_range[0, 1], + grid_points, + device=self.device, + ) + lin_vel = torch.linspace( + self.dof_vel_range[0, 0], + self.dof_vel_range[0, 1], + grid_points, + device=self.device, + ) + grid = torch.cartesian_prod(lin_pos, lin_vel) + self.dof_pos[env_ids] = grid[:, 0].unsqueeze(-1) + self.dof_vel[env_ids] = grid[:, 1].unsqueeze(-1) def _reward_theta(self): - theta_rwd = torch.cos(self.dof_pos[:, 0]) / self.scales["dof_pos"] + theta_rwd = torch.cos(self.dof_pos[:, 0]) # no scaling return self._sqrdexp(theta_rwd.squeeze(dim=-1)) def _reward_omega(self): @@ -17,27 +51,31 @@ def _reward_omega(self): return self._sqrdexp(omega_rwd.squeeze(dim=-1)) def _reward_equilibrium(self): - error = torch.abs(self.dof_state) - error[:, 0] /= self.scales["dof_pos"] - error[:, 1] /= self.scales["dof_vel"] - return self._sqrdexp(torch.mean(error, dim=1), scale=0.01) - # return torch.exp( - # -error.pow(2).sum(dim=1) / self.cfg.reward_settings.tracking_sigma - # ) - - def _reward_torques(self): - """Penalize torques""" - return 
self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), scale=0.2) + theta_norm = self._normalize_theta() + omega = self.dof_vel[:, 0] + error = torch.stack( + [theta_norm / self.scales["dof_pos"], omega / self.scales["dof_vel"]], dim=1 + ) + return self._sqrdexp(torch.mean(error, dim=1), scale=0.2) def _reward_energy(self): - m_pendulum = 1.0 - l_pendulum = 1.0 kinetic_energy = ( - 0.5 * m_pendulum * l_pendulum**2 * torch.square(self.dof_vel[:, 0]) + 0.5 + * self.cfg.asset.mass + * self.cfg.asset.length**2 + * torch.square(self.dof_vel[:, 0]) ) potential_energy = ( - m_pendulum * 9.81 * l_pendulum * torch.cos(self.dof_pos[:, 0]) + self.cfg.asset.mass + * 9.81 + * self.cfg.asset.length + * torch.cos(self.dof_pos[:, 0]) ) - desired_energy = m_pendulum * 9.81 * l_pendulum + desired_energy = self.cfg.asset.mass * 9.81 * self.cfg.asset.length energy_error = kinetic_energy + potential_energy - desired_energy return self._sqrdexp(energy_error / desired_energy) + + def _normalize_theta(self): + # normalize to range [-pi, pi] + theta = self.dof_pos[:, 0] + return ((theta + np.pi) % (2 * np.pi)) - np.pi diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 920356cd..399060a2 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -5,9 +5,9 @@ class PendulumCfg(FixedRobotCfg): class env(FixedRobotCfg.env): - num_envs = 2**13 - num_actuators = 1 # 1 for theta connecting base and pole - episode_length_s = 5.0 + num_envs = 4096 + num_actuators = 1 + episode_length_s = 10 class terrain(FixedRobotCfg.terrain): pass @@ -18,12 +18,12 @@ class viewer: lookat = [0.0, 0.0, 0.0] # [m] class init_state(FixedRobotCfg.init_state): - default_joint_angles = {"theta": 0.0} # -torch.pi / 2.0} + default_joint_angles = {"theta": 0} # -torch.pi / 2.0} # * default setup chooses how the initial conditions are chosen. 
# * "reset_to_basic" = a single position # * "reset_to_range" = uniformly random from a range defined below - reset_mode = "reset_to_range" + reset_mode = "reset_to_uniform" # * initial conditions for reset_to_range dof_pos_range = { @@ -33,8 +33,8 @@ class init_state(FixedRobotCfg.init_state): class control(FixedRobotCfg.control): actuated_joints_mask = [1] # angle - ctrl_frequency = 100 - desired_sim_frequency = 200 + ctrl_frequency = 10 + desired_sim_frequency = 100 stiffness = {"theta": 0.0} # [N*m/rad] damping = {"theta": 0.0} # [N*m*s/rad] @@ -44,6 +44,8 @@ class asset(FixedRobotCfg.asset): disable_gravity = False disable_motors = False # all torques set to 0 joint_damping = 0.1 + mass = 1.0 + length = 1.0 class reward_settings(FixedRobotCfg.reward_settings): tracking_sigma = 0.25 @@ -57,15 +59,18 @@ class scaling(FixedRobotCfg.scaling): class PendulumRunnerCfg(FixedRobotCfgPPO): seed = -1 - runner_class_name = "DataLoggingRunner" + runner_class_name = "IPGRunner" class actor: hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "tanh" + smooth_exploration = False + exploration_sample_freq = 16 + normalize_obs = False obs = [ - "dof_pos", + "dof_pos_obs", "dof_vel", ] @@ -77,21 +82,22 @@ class noise: dof_vel = 0.0 class critic: - obs = [ - "dof_pos", - "dof_vel", - ] hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "tanh" - standard_critic_nn = True + + normalize_obs = False + obs = [ + "dof_pos_obs", + "dof_vel", + ] class reward: class weights: theta = 0.0 omega = 0.0 equilibrium = 1.0 - energy = 0.0 + energy = 0.5 dof_vel = 0.0 torques = 0.025 @@ -99,27 +105,36 @@ class termination_weight: termination = 0.0 class algorithm(FixedRobotCfgPPO.algorithm): - # training params - value_loss_coef = 1.0 - use_clipped_value_loss = True + # both + gamma = 0.95 + # discount_horizon = 2.0 + lam = 0.98 + # shared + max_gradient_steps = 24 + # new + storage_size = 2**17 # new + batch_size = 2**16 # new clip_param = 0.2 + learning_rate = 1.0e-4 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor entropy_coef = 0.01 - num_learning_epochs = 6 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 - learning_rate = 1.0e-3 schedule = "fixed" # could be adaptive, fixed - discount_horizon = 2.0 # [s] - lam = 0.98 - # GAE_bootstrap_horizon = .0 # [s] desired_kl = 0.01 - max_grad_norm = 1.0 - standard_loss = True - plus_c_penalty = 0.1 + + # IPG + polyak = 0.995 + use_cv = False + inter_nu = 0.2 + beta = "off_policy" + storage_size = 4 * 100 * 4096 # num_policies*num_steps*num_envs class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "pendulum" - max_iterations = 500 # number of policy updates - algorithm_class_name = "PPO2" - num_steps_per_env = 32 + max_iterations = 100 # number of policy updates + algorithm_class_name = "PPO_IPG" + num_steps_per_env = 100 + save_interval = 20 diff --git a/learning/modules/lqrc/plotting.py b/learning/modules/lqrc/plotting.py new file mode 100644 index 00000000..dcc913c9 --- /dev/null +++ b/learning/modules/lqrc/plotting.py @@ -0,0 +1,142 @@ +import matplotlib as mpl +import matplotlib.pyplot as plt +import matplotlib.colors as mcolors +from matplotlib.colors import ListedColormap + +import numpy as np + + +def create_custom_bwr_colormap(): + # Define the colors for each segment + dark_blue = [0, 0, 0.5, 1] + light_blue = [0.5, 0.5, 1, 1] + white = [1, 1, 1, 1] + light_red = [1, 0.5, 0.5, 1] + dark_red = [0.5, 0, 0, 
1] + + # Number of bins for each segment + n_bins = 128 + mid_band = 5 + + # Create the colormap segments + blue_segment = np.linspace(dark_blue, light_blue, n_bins // 2) + white_segment = np.tile(white, (mid_band, 1)) + red_segment = np.linspace(light_red, dark_red, n_bins // 2) + + # Stack segments to create the full colormap + colors = np.vstack((blue_segment, white_segment, red_segment)) + custom_bwr = ListedColormap(colors, name="custom_bwr") + + return custom_bwr + + +def plot_pendulum_critics_with_data( + x, + predictions, + targets, + actions, + title, + fn, + data, + colorbar_label="f(x)", + grid_size=64, +): + num_critics = len(x.keys()) + fig, axes = plt.subplots(nrows=2, ncols=num_critics, figsize=(4 * num_critics, 6)) + + # Determine global min and max error for consistent scaling + global_min_error = float("inf") + global_max_error = float("-inf") + global_min_prediction = float("inf") + global_max_prediction = float("-inf") + prediction_cmap = mpl.cm.get_cmap("viridis") + action_cmap = mpl.cm.get_cmap("bwr") + + for critic_name in x: + np_predictions = predictions[critic_name].detach().cpu().numpy().reshape(-1) + np_targets = ( + targets["Ground Truth MC Returns"].detach().cpu().numpy().reshape(-1) + ) + np_error = np_predictions - np_targets + global_min_error = min(global_min_error, np.min(np_error)) + global_max_error = max(global_max_error, np.max(np_error)) + global_min_prediction = min( + global_min_prediction, np.min(np_predictions), np.min(np_targets) + ) + global_max_prediction = max( + global_max_prediction, np.max(np_predictions), np.max(np_targets) + ) + prediction_norm = mcolors.CenteredNorm( + vcenter=(global_max_prediction + global_min_prediction) / 2, + halfrange=(global_max_prediction - global_min_prediction) / 2, + ) + action_norm = mcolors.CenteredNorm() + + xcord = np.linspace(-2 * np.pi, 2 * np.pi, grid_size) + ycord = np.linspace(-5, 5, grid_size) + + for idx, critic_name in enumerate(x): + np_predictions = predictions[critic_name].detach().cpu().numpy().reshape(-1) + np_targets = ( + targets["Ground Truth MC Returns"].detach().cpu().numpy().reshape(-1) + ) + np_error = np_predictions - np_targets + + # Plot Predictions + axes[0, idx].imshow( + np_predictions.reshape(grid_size, grid_size).T, + origin="lower", + extent=(xcord.min(), xcord.max(), ycord.min(), ycord.max()), + cmap=prediction_cmap, + norm=prediction_norm, + ) + axes[0, idx].set_title(f"{critic_name} Prediction") + + # Plot MC Trajectories + data = data.detach().cpu().numpy() + theta = data[:, :, 0] + omega = data[:, :, 1] + axes[1, 0].plot(theta, omega, lw=1) + axes[1, 0].set_xlabel("theta") + axes[1, 0].set_ylabel("theta_dot") + fig.suptitle(title, fontsize=16) + + # Plot Actions + np_actions = actions.detach().cpu().numpy().reshape(-1) + axes[1, 1].imshow( + np_actions.reshape(grid_size, grid_size).T, + origin="lower", + extent=(xcord.min(), xcord.max(), ycord.min(), ycord.max()), + cmap=action_cmap, + norm=action_norm, + ) + axes[1, 1].set_title("Actions") + ax1_mappable = mpl.cm.ScalarMappable(norm=action_norm, cmap=action_cmap) + + # Last axis empty + axes[1, 2].axis("off") + + # Ensure the axes are the same for all plots + for ax in axes.flat: + ax.set_xlim([xcord.min(), xcord.max()]) + ax.set_ylim([ycord.min(), ycord.max()]) + + plt.subplots_adjust( + top=0.9, bottom=0.1, left=0.1, right=0.9, hspace=0.4, wspace=0.3 + ) + + fig.colorbar( + mpl.cm.ScalarMappable(norm=prediction_norm, cmap=prediction_cmap), + ax=axes[0, :].ravel().tolist(), + shrink=0.95, + label=colorbar_label, + ) + 
fig.colorbar( + ax1_mappable, + ax=axes[1, :].ravel().tolist(), + shrink=0.95, + label=colorbar_label, + ) + + plt.savefig(f"{fn}.png") + print(f"Saved to {fn}.png") diff --git a/learning/runners/ipg_runner.py b/learning/runners/ipg_runner.py index bb8bb044..4b7b5dfe 100644 --- a/learning/runners/ipg_runner.py +++ b/learning/runners/ipg_runner.py @@ -59,7 +59,7 @@ def learn(self): actor_obs = self.get_obs(self.actor_cfg["obs"]) critic_obs = self.get_obs(self.critic_cfg["obs"]) tot_iter = self.it + self.num_learning_iterations - self.save() + # self.save() # * Initialize smooth exploration matrices if self.actor_cfg["smooth_exploration"]: @@ -76,6 +76,8 @@ def learn(self): "next_critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], "dones": self.get_timed_out(), + "dof_pos": self.env.dof_pos, + "dof_vel": self.env.dof_vel, } ) storage_onpol.initialize( @@ -144,6 +146,8 @@ def learn(self): "rewards": total_rewards, "timed_out": timed_out, "dones": dones, + "dof_pos": self.env.dof_pos, + "dof_vel": self.env.dof_vel, } ) # add transition to both storages @@ -157,18 +161,19 @@ def learn(self): logger.tic("learning") self.alg.update(storage_onpol.data, storage_offpol.get_data()) - storage_onpol.clear() # only clear on-policy storage logger.toc("learning") - logger.log_all_categories() + if self.it % self.save_interval == 0: + self.save() + storage_onpol.clear() # only clear on-policy storage + + logger.log_all_categories() logger.finish_iteration() logger.toc("iteration") logger.toc("runtime") logger.print_to_terminal() - if self.it % self.save_interval == 0: - self.save() - self.save() + self.save(end=True) @torch.no_grad def burn_in_normalization(self, n_iterations=100): @@ -227,7 +232,7 @@ def set_up_logger(self): (self.alg.actor, self.alg.critic_v, self.alg.critic_q) ) - def save(self): + def save(self, end=False): os.makedirs(self.log_dir, exist_ok=True) path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) torch.save( @@ -242,6 +247,13 @@ def save(self): }, path, ) + # Save data + path_offpol = os.path.join(self.log_dir, "data_offpol_{}".format(self.it)) + torch.save(storage_offpol.get_data().cpu(), path_offpol + ".pt") + # When training ends the on-policy data was cleared + if not end: + path_onpol = os.path.join(self.log_dir, "data_onpol_{}".format(self.it)) + torch.save(storage_onpol.data.cpu(), path_onpol + ".pt") def load(self, path, load_optimizer=True, load_actor_std=True): loaded_dict = torch.load(path) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index c3465433..6823a6d4 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -30,7 +30,7 @@ def learn(self, states_to_log_dict=None): actor_obs = self.get_obs(self.actor_cfg["obs"]) critic_obs = self.get_obs(self.critic_cfg["obs"]) tot_iter = self.it + self.num_learning_iterations - self.save() + # self.save() # * Initialize smooth exploration matrices if self.actor_cfg["smooth_exploration"]: @@ -47,6 +47,8 @@ def learn(self, states_to_log_dict=None): "next_critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], "dones": self.get_timed_out(), + "dof_pos": self.env.dof_pos, + "dof_vel": self.env.dof_vel, } ) storage.initialize( @@ -116,6 +118,8 @@ def learn(self, states_to_log_dict=None): "rewards": total_rewards, "timed_out": timed_out, "dones": dones, + "dof_pos": self.env.dof_pos, + "dof_vel": self.env.dof_vel, } ) storage.add_transitions(transition) @@ -127,18 +131,19 
@@ def learn(self, states_to_log_dict=None): logger.tic("learning") self.alg.update(storage.data) - storage.clear() logger.toc("learning") - logger.log_all_categories() + if self.it % self.save_interval == 0: + self.save() + storage.clear() + + logger.log_all_categories() logger.finish_iteration() logger.toc("iteration") logger.toc("runtime") logger.print_to_terminal() - if self.it % self.save_interval == 0: - self.save() - self.save() + self.save(end=True) @torch.no_grad def burn_in_normalization(self, n_iterations=100): @@ -185,7 +190,7 @@ def set_up_logger(self): logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) - def save(self): + def save(self, end=False): os.makedirs(self.log_dir, exist_ok=True) path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) torch.save( @@ -198,6 +203,10 @@ def save(self): }, path, ) + # Save data if training hasn't ended, otherwise storage was cleared + if not end: + path_data = os.path.join(self.log_dir, "data_{}".format(self.it)) + torch.save(storage.data.cpu(), path_data + ".pt") def load(self, path, load_optimizer=True): loaded_dict = torch.load(path) diff --git a/scripts/visualize_ipg.py b/scripts/visualize_ipg.py new file mode 100644 index 00000000..993f4e0d --- /dev/null +++ b/scripts/visualize_ipg.py @@ -0,0 +1,124 @@ +import os +import shutil +import torch +import numpy as np +import matplotlib.pyplot as plt + +from learning.modules.actor import Actor +from learning.modules.critic import Critic + +from learning.utils import ( + compute_generalized_advantages, + compute_MC_returns, +) +from learning.modules.lqrc.plotting import plot_pendulum_critics_with_data +from gym import LEGGED_GYM_ROOT_DIR + +DEVICE = "cpu" + +# * Setup +LOAD_RUN = "Jul17_17-22-37_IPG_nu1" +TITLE = "IPG nu=1.0" +IT_RANGE = range(20, 101, 20) + +RUN_DIR = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", "pendulum", LOAD_RUN) +PLOT_DIR = os.path.join(RUN_DIR, "visualize_critic") +os.makedirs(PLOT_DIR, exist_ok=True) + +# * V-Critic, Q-Critic and Actor +critic_q = Critic( + num_obs=4, hidden_dims=[128, 64, 32], activation="tanh", normalize_obs=False +).to(DEVICE) +critic_v = Critic( + num_obs=3, hidden_dims=[128, 64, 32], activation="tanh", normalize_obs=False +).to(DEVICE) +actor = Actor( + num_obs=3, + num_actions=1, + hidden_dims=[128, 64, 32], + activation="tanh", + normalize_obs=False, +).to(DEVICE) + +# * Params +n_envs = 4096 # that were trained with +gamma = 0.95 +lam = 1.0 +episode_steps = 100 +visualize_steps = 10 # just to show rollouts +n_trajs = 64 +rand_perm = torch.randperm(n_envs) +traj_idx = rand_perm[0:n_trajs] +test_idx = rand_perm[n_trajs : n_trajs + 1000] + +for it in IT_RANGE: + # load data + base_data = torch.load(os.path.join(RUN_DIR, "data_onpol{}.pt".format(it))).to( + DEVICE + ) + data = base_data.detach().clone() + + dof_pos = base_data["dof_pos"].detach().clone() + dof_vel = base_data["dof_vel"].detach().clone() + graphing_obs = torch.cat((dof_pos, dof_vel), dim=2) + + # compute ground-truth + graphing_data = { + data_name: {} for data_name in ["obs", "values", "returns", "actions"] + } + + episode_rollouts = compute_MC_returns(base_data, gamma) + print(f"Initializing value offset to: {episode_rollouts.mean().item()}") + graphing_data["obs"]["Ground Truth MC Returns"] = graphing_obs[0, :] + graphing_data["values"]["Ground Truth MC Returns"] = episode_rollouts[0, :] + graphing_data["returns"]["Ground Truth MC Returns"] = episode_rollouts[0, :] + + # load models + model = torch.load(os.path.join(RUN_DIR, "model_{}.pt".format(it))) + 
critic_v.load_state_dict(model["critic_v_state_dict"]) + critic_q.load_state_dict(model["critic_q_state_dict"]) + actor.load_state_dict(model["actor_state_dict"]) + + # V-critic values and returns + data["values"] = critic_v.evaluate(data["critic_obs"]) + data["advantages"] = compute_generalized_advantages(data, gamma, lam, critic_v) + data["returns"] = data["advantages"] + data["values"] + + with torch.no_grad(): + graphing_data["obs"]["V-Critic"] = graphing_obs[0, :] + graphing_data["values"]["V-Critic"] = critic_v.evaluate( + data[0, :]["critic_obs"] + ) + graphing_data["returns"]["V-Critic"] = data[0, :]["returns"] + graphing_data["actions"] = actor(data[0, :]["actor_obs"]) + + # Q-critic values and returns + actions = actor(data["actor_obs"]) + critic_q_obs = torch.cat((data["critic_obs"], actions), dim=2) + data["values"] = critic_q.evaluate(critic_q_obs) + data["next_critic_obs"] = critic_q_obs # needed for GAE + data["advantages"] = compute_generalized_advantages(data, gamma, lam, critic_q) + data["returns"] = data["advantages"] + data["values"] + + with torch.no_grad(): + graphing_data["obs"]["Q-Critic"] = graphing_obs[0, :] + graphing_data["values"]["Q-Critic"] = critic_q.evaluate(critic_q_obs[0, :]) + graphing_data["returns"]["Q-Critic"] = data[0, :]["returns"] + + # generate plots + grid_size = int(np.sqrt(n_envs)) + plot_pendulum_critics_with_data( + x=graphing_data["obs"], + predictions=graphing_data["values"], + targets=graphing_data["returns"], + actions=graphing_data["actions"], + title=f"{TITLE} Iteration {it}", + fn=PLOT_DIR + f"/IPG_it{it}", + data=graphing_obs[:visualize_steps, traj_idx], + grid_size=grid_size, + ) + + plt.close() + +this_file = os.path.join(LEGGED_GYM_ROOT_DIR, "scripts", "visualize_ipg.py") +shutil.copy(this_file, os.path.join(PLOT_DIR, os.path.basename(this_file))) diff --git a/scripts/visualize_ppo.py b/scripts/visualize_ppo.py new file mode 100644 index 00000000..61cf123a --- /dev/null +++ b/scripts/visualize_ppo.py @@ -0,0 +1,102 @@ +import os +import shutil +import torch +import numpy as np +import matplotlib.pyplot as plt + +from learning.modules.actor import Actor +from learning.modules.critic import Critic + +from learning.utils import ( + compute_generalized_advantages, + compute_MC_returns, +) +from learning.modules.lqrc.plotting import plot_pendulum_critics_with_data +from gym import LEGGED_GYM_ROOT_DIR + +DEVICE = "cpu" + +# * Setup +LOAD_RUN = "Jul17_15-39-53_PPO" +TITLE = "PPO" +IT_RANGE = range(20, 101, 20) + +RUN_DIR = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", "pendulum", LOAD_RUN) +PLOT_DIR = os.path.join(RUN_DIR, "visualize_critic") +os.makedirs(PLOT_DIR, exist_ok=True) + +# * Critic and Actor +critic = Critic( + num_obs=3, hidden_dims=[128, 64, 32], activation="tanh", normalize_obs=False +).to(DEVICE) +actor = Actor( + num_obs=3, + num_actions=1, + hidden_dims=[128, 64, 32], + activation="tanh", + normalize_obs=False, +).to(DEVICE) + +# * Params +n_envs = 4096 # that were trained with +gamma = 0.95 +lam = 1.0 +visualize_steps = 10 # just to show rollouts +n_trajs = 64 +rand_perm = torch.randperm(n_envs) +traj_idx = rand_perm[0:n_trajs] +test_idx = rand_perm[n_trajs : n_trajs + 1000] + +for it in IT_RANGE: + # load data + base_data = torch.load(os.path.join(RUN_DIR, "data_{}.pt".format(it))).to(DEVICE) + data = base_data.detach().clone() + + dof_pos = base_data["dof_pos"].detach().clone() + dof_vel = base_data["dof_vel"].detach().clone() + graphing_obs = torch.cat((dof_pos, dof_vel), dim=2) + + # compute ground-truth + graphing_data 
= { + data_name: {} for data_name in ["obs", "values", "returns", "actions"] + } + + episode_rollouts = compute_MC_returns(base_data, gamma) + print(f"Initializing value offset to: {episode_rollouts.mean().item()}") + graphing_data["obs"]["Ground Truth MC Returns"] = graphing_obs[0, :] + graphing_data["values"]["Ground Truth MC Returns"] = episode_rollouts[0, :] + graphing_data["returns"]["Ground Truth MC Returns"] = episode_rollouts[0, :] + + # load models + model = torch.load(os.path.join(RUN_DIR, "model_{}.pt".format(it))) + critic.load_state_dict(model["critic_state_dict"]) + actor.load_state_dict(model["actor_state_dict"]) + + # compute values and returns + data["values"] = critic.evaluate(data["critic_obs"]) + data["advantages"] = compute_generalized_advantages(data, gamma, lam, critic) + data["returns"] = data["advantages"] + data["values"] + + with torch.no_grad(): + graphing_data["obs"]["V-Critic"] = graphing_obs[0, :] + graphing_data["values"]["V-Critic"] = critic.evaluate(data[0, :]["critic_obs"]) + graphing_data["returns"]["V-Critic"] = data[0, :]["returns"] + graphing_data["actions"]["V-Critic"] = actor(data[0, :]["actor_obs"]) + + # generate plots + grid_size = int(np.sqrt(n_envs)) + plot_pendulum_critics_with_data( + x=graphing_data["obs"], + predictions=graphing_data["values"], + targets=graphing_data["returns"], + actions=graphing_data["actions"], + title=f"{TITLE} Iteration {it}", + fn=PLOT_DIR + f"/PPO_it{it}", + data=graphing_obs[:visualize_steps, traj_idx], + grid_size=grid_size, + ) + + plt.close() + +this_file = os.path.join(LEGGED_GYM_ROOT_DIR, "scripts", "visualize_ppo.py") +shutil.copy(this_file, os.path.join(PLOT_DIR, os.path.basename(this_file))) From 43dec750a5ea6d92f702eb9543e842566a8ff2ea Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 19 Jul 2024 17:05:24 -0400 Subject: [PATCH 37/42] finetune IPG with sim on-policy and hardware off-policy, rewards increase! 
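Finetuning now mixes two data sources: the simulator provides fresh on-policy
rollouts, while the hardware .mat logs form the off-policy buffer, and PPO_IPG
blends the two gradient estimates with inter_nu. For orientation only, a minimal
sketch of that blended actor update follows. PPO_IPG itself is not part of this
diff, so the function name ipg_actor_update_sketch, the optimizer handling, the
plain likelihood-ratio surrogate, and the assumption that advantages are already
stored in the on-policy batch are all illustrative; act_inference, evaluate,
update_distribution, get_actions_log_prob and the TensorDict keys mirror the
interfaces already used elsewhere in this series.

    import torch


    def ipg_actor_update_sketch(
        actor, critic_q, optimizer, data_onpol, data_offpol, inter_nu=0.5
    ):
        # On-policy term: likelihood-ratio surrogate on simulator rollouts.
        actor.update_distribution(data_onpol["actor_obs"])
        log_prob = actor.get_actions_log_prob(data_onpol["actions"])
        advantages = data_onpol["advantages"].squeeze(-1)  # assumed precomputed
        onpol_loss = -(log_prob * advantages).mean()

        # Off-policy term: push the deterministic policy output towards
        # higher Q on the hardware replay data.
        mean_actions = actor.act_inference(data_offpol["actor_obs"])
        q_in = torch.cat((data_offpol["critic_obs"], mean_actions), dim=-1)
        offpol_loss = -critic_q.evaluate(q_in).mean()

        # inter_nu interpolates the two estimates: nu=0 is the pure
        # on-policy update, nu=1 relies entirely on the Q-critic.
        loss = (1.0 - inter_nu) * onpol_loss + inter_nu * offpol_loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        return loss.item()

With inter_nu = 0.5 as in the config below, the two terms are weighted
equally under this sketch's convention.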
--- .../mini_cheetah_finetune_config.py | 27 +++--- gym/envs/mini_cheetah/minimalist_cheetah.py | 2 +- learning/runners/finetune_runner.py | 44 ++++----- scripts/eval_rewards.py | 89 +++++++++++++++---- scripts/finetune.py | 77 ++++++++++------ 5 files changed, 156 insertions(+), 83 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py b/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py index 284fcd0a..32db3fb2 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py @@ -8,7 +8,7 @@ class MiniCheetahFineTuneCfg(MiniCheetahRefCfg): class env(MiniCheetahRefCfg.env): - num_envs = 1 + num_envs = 1024 num_actuators = 12 episode_length_s = 30.0 @@ -23,7 +23,7 @@ class control(MiniCheetahRefCfg.control): stiffness = {"haa": 20.0, "hfe": 20.0, "kfe": 20.0} damping = {"haa": 0.5, "hfe": 0.5, "kfe": 0.5} gait_freq = 3.0 - ctrl_frequency = 50 + ctrl_frequency = 100 desired_sim_frequency = 500 class commands(MiniCheetahRefCfg.commands): @@ -42,7 +42,7 @@ class push_robots(MiniCheetahRefCfg.push_robots): class domain_rand(MiniCheetahRefCfg.domain_rand): randomize_friction = False friction_range = [0.5, 1.0] - randomize_base_mass = True + randomize_base_mass = False added_mass_range = [5.0, 5.0] class asset(MiniCheetahRefCfg.asset): @@ -108,15 +108,15 @@ class critic: class reward: class weights: - tracking_lin_vel = 2.0 - tracking_ang_vel = 1.0 + tracking_lin_vel = 4.0 + tracking_ang_vel = 2.0 orientation = 1.0 min_base_height = 1.5 - action_rate = 0.001 + stand_still = 2.0 + swing_grf = 3.0 + stance_grf = 3.0 + action_rate = 0.01 action_rate2 = 0.001 - stand_still = 1.0 - swing_grf = 1.0 - stance_grf = 1.0 class termination_weight: termination = 0.15 @@ -129,18 +129,21 @@ class algorithm(MiniCheetahRefRunnerCfg.algorithm): use_cv = False inter_nu = 0.5 beta = "off_policy" - storage_size = 4 * 32 * 4096 # num_policies*num_stpes*num_envs + storage_size = 4 * 32 * 4096 # num_policies*num_steps*num_envs # Finetuning + clip_param = 0.2 + max_gradient_steps = 4 + batch_size = 30000 learning_rate = 1e-4 - max_gradient_steps = 5 + schedule = "fixed" class runner(MiniCheetahRefRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" max_iterations = 20 # number of policy updates algorithm_class_name = "PPO_IPG" - num_steps_per_env = 3000 + num_steps_per_env = 200 # Finetuning resume = True diff --git a/gym/envs/mini_cheetah/minimalist_cheetah.py b/gym/envs/mini_cheetah/minimalist_cheetah.py index bb67fb69..1a8af6bf 100644 --- a/gym/envs/mini_cheetah/minimalist_cheetah.py +++ b/gym/envs/mini_cheetah/minimalist_cheetah.py @@ -85,7 +85,7 @@ def _reward_tracking_lin_vel(self): # * scale by (1+|cmd|): if cmd=0, no scaling. 
error *= 1.0 / (1.0 + torch.abs(self.commands[:, :2])) error = torch.sum(torch.square(error), dim=1) - return torch.exp(-error / self.tracking_sigma) + return torch.exp(-error / self.tracking_sigma) * (1 - self._switch()) def _reward_tracking_ang_vel(self): """Tracking of angular velocity commands (yaw)""" diff --git a/learning/runners/finetune_runner.py b/learning/runners/finetune_runner.py index 064073b3..127d95d4 100644 --- a/learning/runners/finetune_runner.py +++ b/learning/runners/finetune_runner.py @@ -20,17 +20,16 @@ def __init__( exploration_scale=1.0, device="cpu", ): - self.env = env - self.parse_train_cfg(train_cfg) self.data_onpol = data_onpol self.data_offpol = data_offpol self.exploration_scale = exploration_scale - self.device = device + # Init calls _set_up_alg which needs exploration scale + super().__init__(env, train_cfg, device) def _set_up_alg(self): - num_actor_obs = self.data_onpol["actor_obs"].shape[-1] - num_actions = self.data_onpol["actions"].shape[-1] - num_critic_obs = self.data_onpol["critic_obs"].shape[-1] + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) if self.actor_cfg["smooth_exploration"]: actor = SmoothActor( num_obs=num_actor_obs, @@ -41,10 +40,8 @@ def _set_up_alg(self): else: actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) - alg_class_name = self.cfg["algorithm_class_name"] - alg_class = eval(alg_class_name) - - if alg_class_name == "PPO_IPG": + alg_class = eval(self.cfg["algorithm_class_name"]) + if self.data_offpol is not None: critic_v = Critic(num_critic_obs, **self.critic_cfg) critic_q = Critic(num_critic_obs + num_actions, **self.critic_cfg) target_critic_q = Critic(num_critic_obs + num_actions, **self.critic_cfg) @@ -71,16 +68,11 @@ def learn(self): self.alg.switch_to_train() if self.env is not None: - # Simulate 1 env, same number of steps as in data - num_steps = self.data_onpol.shape[0] - sim_data = self.get_sim_data(num_steps) - # Concatenate data dict wtih sim data + # Simulate on-policy data self.data_onpol = TensorDict( - { - name: torch.cat((self.data_onpol[name], sim_data[name]), dim=1) - for name in self.data_onpol.keys() - }, - batch_size=(num_steps, 2), + self.get_sim_data(), + batch_size=(self.num_steps_per_env, self.env.num_envs), + device=self.device, ) # Single alg update on data @@ -89,17 +81,17 @@ def learn(self): else: self.alg.update(self.data_onpol, self.data_offpol) - def get_sim_data(self, num_steps): + def get_sim_data(self): rewards_dict = {} actor_obs = self.get_obs(self.actor_cfg["obs"]) critic_obs = self.get_obs(self.critic_cfg["obs"]) # * Initialize smooth exploration matrices if self.actor_cfg["smooth_exploration"]: - self.alg.actor.sample_weights(batch_size=1) + self.alg.actor.sample_weights(batch_size=self.env.num_envs) # * Start up storage - transition = TensorDict({}, batch_size=1, device=self.device) + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) transition.update( { "actor_obs": actor_obs, @@ -113,18 +105,18 @@ def get_sim_data(self, num_steps): ) sim_storage.initialize( transition, - num_envs=1, - max_storage=num_steps, + self.env.num_envs, + self.env.num_envs * self.num_steps_per_env, device=self.device, ) # * Rollout with torch.inference_mode(): - for i in range(num_steps): + for i in range(self.num_steps_per_env): # * Re-sample noise matrix for smooth exploration sample_freq = self.actor_cfg["exploration_sample_freq"] if 
self.actor_cfg["smooth_exploration"] and i % sample_freq == 0: - self.alg.actor.sample_weights(batch_size=1) + self.alg.actor.sample_weights(batch_size=self.env.num_envs) actions = self.alg.act(actor_obs) self.set_actions( diff --git a/scripts/eval_rewards.py b/scripts/eval_rewards.py index 1e4d6c50..0cf3ec68 100644 --- a/scripts/eval_rewards.py +++ b/scripts/eval_rewards.py @@ -10,11 +10,11 @@ import matplotlib.pyplot as plt ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" -LOAD_RUN = "Jul13_01-49-59_PPO32_S16" +LOAD_RUN = "Jul12_15-53-57_IPG32_S16" REWARDS_FILE = "rewards.csv" -ITERATIONS = range(700, 705) -PLOT = True +DATAFRAME_LOADED = True +PLOT = False PLOT_N = 1000 # number of steps to plot # Data struct fields from Robot-Software logs @@ -36,14 +36,14 @@ ] REWARD_WEIGHTS = { + "tracking_lin_vel": 4.0, + "tracking_ang_vel": 2.0, "min_base_height": 1.5, - # "tracking_lin_vel": 4.0, - # "tracking_ang_vel": 2.0, "orientation": 1.0, - "swing_grf": 1.0, - "stance_grf": 1.0, - "stand_still": 1.0, - "action_rate": 0.001, + "stand_still": 2.0, + "swing_grf": 3.0, + "stance_grf": 3.0, + "action_rate": 0.01, "action_rate2": 0.001, } @@ -110,21 +110,78 @@ def get_rewards(it, data_struct, rewards_path): def setup(name="SMOOTH_RL_CONTROLLER"): data_dict = {} - for it in ITERATIONS: - path = os.path.join(ROOT_DIR, LOAD_RUN, f"{it}.mat") + log_files = [ + file + for file in os.listdir(os.path.join(ROOT_DIR, LOAD_RUN)) + if file.endswith(".mat") + ] + log_files = sorted(log_files) + for file in log_files: + iteration = int(file.split(".")[0]) + path = os.path.join(ROOT_DIR, LOAD_RUN, file) data = scipy.io.loadmat(path) - data_dict[it] = data[name][0][0] + data_dict[iteration] = data[name][0][0] return data_dict if __name__ == "__main__": data_dict = setup() rewards_path = os.path.join(ROOT_DIR, LOAD_RUN, REWARDS_FILE) - if os.path.exists(rewards_path): - os.remove(rewards_path) - for it, data_struct in data_dict.items(): - get_rewards(it, data_struct, rewards_path) + if not DATAFRAME_LOADED: + # Compute rewards and store in dataframe + if os.path.exists(rewards_path): + os.remove(rewards_path) + for it, data_struct in data_dict.items(): + get_rewards(it, data_struct, rewards_path) rewards_df = pd.read_csv(rewards_path) print(rewards_df) + + # Plot rewards stats + num_plots = len(REWARD_WEIGHTS) + 1 # +1 for total rewards + cols = 5 + rows = np.ceil(num_plots / cols).astype(int) + fig, axs = plt.subplots(rows, cols, figsize=(15, 6)) + fig.suptitle("IPG Finetuning Rewards") + axs = axs.flatten() + + for i, key in enumerate(REWARD_WEIGHTS.keys()): + rewards_mean = rewards_df[rewards_df["type"] == key]["mean"].reset_index( + drop=True + ) + rewards_std = rewards_df[rewards_df["type"] == key]["std"].reset_index( + drop=True + ) + axs[i].plot(rewards_mean, label=key) + axs[i].fill_between( + range(len(rewards_mean)), + rewards_mean - rewards_std, + rewards_mean + rewards_std, + color="b", + alpha=0.2, + ) + axs[i].legend() + axs[i].set_xlabel("Iter") + + i = num_plots - 1 + total_mean = rewards_df[rewards_df["type"] == "total"]["mean"].reset_index( + drop=True + ) + total_std = rewards_df[rewards_df["type"] == "total"]["std"].reset_index(drop=True) + axs[i].plot(total_mean, label="total", color="r") + axs[i].fill_between( + range(len(total_mean)), + total_mean - total_std, + total_mean + total_std, + color="r", + alpha=0.2, + ) + axs[i].legend() + axs[i].set_xlabel("Iter") + + for i in range(num_plots, len(axs)): + axs[i].axis("off") + + plt.tight_layout() + plt.show() diff --git 
a/scripts/finetune.py b/scripts/finetune.py index 51f165cf..ca7d404a 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -15,8 +15,10 @@ from tensordict import TensorDict ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" +OUTPUT_FILE = "output.txt" -USE_SIMULATOR = False +USE_SIMULATOR = True +DATA_LENGTH = 2990 # Robot-Software logs must contain at least this many steps # Scales EXPLORATION_SCALE = 0.8 # used during data collection @@ -44,22 +46,23 @@ DEVICE = "cuda" -def get_obs(obs_list, data_struct, data_length): +def get_obs(obs_list, data_struct): obs_all = torch.empty(0).to(DEVICE) for obs_name in obs_list: data_idx = DATA_LIST.index(obs_name) obs = torch.tensor(data_struct[data_idx]).to(DEVICE) - obs = obs.reshape((data_length, 1, -1)) # shape (data_length, 1, n) + obs = obs.squeeze()[:DATA_LENGTH] + obs = obs.reshape((DATA_LENGTH, 1, -1)) # shape (data_length, 1, n) obs_all = torch.cat((obs_all, obs), dim=-1) return obs_all.float() -def get_rewards(data_struct, reward_weights, data_length): +def get_rewards(data_struct, reward_weights, dt): minimalist_cheetah = MinimalistCheetah(device=DEVICE) rewards_all = torch.empty(0).to(DEVICE) - for i in range(data_length): + for i in range(DATA_LENGTH): minimalist_cheetah.set_states( base_height=data_struct[1][:, i], # shape (1, data_length) base_lin_vel=data_struct[2][i], @@ -75,7 +78,7 @@ def get_rewards(data_struct, reward_weights, data_length): total_rewards = 0 for name, weight in reward_weights.items(): reward = weight * eval(f"minimalist_cheetah._reward_{name}()") - total_rewards += reward + total_rewards += dt * reward # scaled rewards_all = torch.cat((rewards_all, total_rewards), dim=0) # Post process mini cheetah minimalist_cheetah.post_process() @@ -83,23 +86,26 @@ def get_rewards(data_struct, reward_weights, data_length): return rewards_all.float() -def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER", offpol=False): +def get_data_dict(train_cfg, env_cfg, offpol=False, name="SMOOTH_RL_CONTROLLER"): load_run = train_cfg["runner"]["load_run"] checkpoint = train_cfg["runner"]["checkpoint"] run_dir = os.path.join(ROOT_DIR, load_run) if offpol: - log_files = [file for file in os.listdir(run_dir) if file.endswith(".mat")] + # All files up until checkpoint + log_files = [ + file + for file in os.listdir(run_dir) + if file.endswith(".mat") and int(file.split(".")[0]) <= checkpoint + ] log_files = sorted(log_files) else: - # Single log file + # Single log file for checkpoint log_files = [str(checkpoint) + ".mat"] # Initialize data dict data = scipy.io.loadmat(os.path.join(run_dir, log_files[0])) - data_struct = data[name][0][0] - data_length = data_struct[1].shape[1] # base_height: shape (1, data_length) - batch_size = (data_length - 1, len(log_files)) # -1 for next_obs + batch_size = (DATA_LENGTH - 1, len(log_files)) # -1 for next_obs data_dict = TensorDict({}, device=DEVICE, batch_size=batch_size) # Get all data @@ -111,19 +117,22 @@ def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER", offpol=False): data = scipy.io.loadmat(os.path.join(run_dir, log)) data_struct = data[name][0][0] - actor_obs = get_obs(train_cfg["actor"]["obs"], data_struct, data_length) - critic_obs = get_obs(train_cfg["critic"]["obs"], data_struct, data_length) + actor_obs = get_obs(train_cfg["actor"]["obs"], data_struct) + critic_obs = get_obs(train_cfg["critic"]["obs"], data_struct) actor_obs_all = torch.cat((actor_obs_all, actor_obs), dim=1) critic_obs_all = torch.cat((critic_obs_all, critic_obs), dim=1) actions_idx = 
DATA_LIST.index("dof_pos_target") actions = torch.tensor(data_struct[actions_idx]).to(DEVICE).float() - actions = actions.reshape((data_length, 1, -1)) # shape (data_length, 1, n) + actions = actions[:DATA_LENGTH] + actions = actions.reshape((DATA_LENGTH, 1, -1)) # shape (data_length, 1, n) actions_all = torch.cat((actions_all, actions), dim=1) reward_weights = train_cfg["critic"]["reward"]["weights"] - rewards = get_rewards(data_struct, reward_weights, data_length) - rewards = rewards.reshape((data_length, 1)) # shape (data_length, 1) + dt = 1.0 / env_cfg["control"]["ctrl_frequency"] + rewards = get_rewards(data_struct, reward_weights, dt) + rewards = rewards[:DATA_LENGTH] + rewards = rewards.reshape((DATA_LENGTH, 1)) # shape (data_length, 1) rewards_all = torch.cat((rewards_all, rewards), dim=1) data_dict["actor_obs"] = actor_obs_all[:-1] @@ -152,16 +161,14 @@ def setup(): env = None train_cfg = class_to_dict(train_cfg) - data_onpol = get_data_dict(train_cfg, offpol=False) + env_cfg = class_to_dict(env_cfg) + data_onpol = get_data_dict(train_cfg, env_cfg, offpol=False) if train_cfg["runner"]["algorithm_class_name"] == "PPO_IPG": - data_offpol = get_data_dict(train_cfg, offpol=True) + data_offpol = get_data_dict(train_cfg, env_cfg, offpol=True) else: data_offpol = None - print(data_onpol) - print(data_offpol) - runner = FineTuneRunner( env, train_cfg, @@ -170,7 +177,6 @@ def setup(): exploration_scale=EXPLORATION_SCALE, device=DEVICE, ) - runner._set_up_alg() return runner @@ -179,7 +185,6 @@ def finetune(runner): load_run = runner.cfg["load_run"] checkpoint = runner.cfg["checkpoint"] load_path = os.path.join(ROOT_DIR, load_run, "model_" + str(checkpoint) + ".pt") - print("Loading model from: ", load_path) runner.load(load_path) # Perform a single update @@ -192,15 +197,31 @@ def finetune(runner): runner.data_onpol["actor_obs"] ) diff = actions_new - actions_old - print("Mean action diff per actuator: ", diff.mean(dim=(0, 1))) - print("Overall mean action diff: ", diff.mean()) + # Save and export save_path = os.path.join(ROOT_DIR, load_run, "model_" + str(checkpoint + 1) + ".pt") - runner.save(save_path) - export_path = os.path.join(ROOT_DIR, load_run, "exported_" + str(checkpoint + 1)) + runner.save(save_path) runner.export(export_path) + # Print to output file + with open(os.path.join(ROOT_DIR, load_run, OUTPUT_FILE), "a") as f: + f.write(f"############ Checkpoint: {checkpoint} #######################\n") + f.write("############### DATA ###############\n") + f.write(f"Data on-policy shape: {runner.data_onpol.shape}\n") + if runner.data_offpol is not None: + f.write(f"Data off-policy shape: {runner.data_offpol.shape}\n") + f.write("############## LOSSES ##############\n") + f.write(f"Mean Value Loss: {runner.alg.mean_value_loss}\n") + f.write(f"Mean Surrogate Loss: {runner.alg.mean_surrogate_loss}\n") + if runner.data_offpol is not None: + f.write(f"Mean Q Loss: {runner.alg.mean_q_loss}\n") + f.write(f"Mean Offpol Loss: {runner.alg.mean_offpol_loss}\n") + f.write("############## ACTIONS #############\n") + f.write(f"Mean action diff per actuator: {diff.mean(dim=(0, 1))}\n") + f.write(f"Std action diff per actuator: {diff.std(dim=(0, 1))}\n") + f.write(f"Overall mean action diff: {diff.mean()}\n") + if __name__ == "__main__": runner = setup() From b004dbbbe0c2b71f3276d9668d8558325daff4e0 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 19 Jul 2024 19:12:18 -0400 Subject: [PATCH 38/42] update reward plots --- scripts/eval_rewards.py | 77 ++++++++++++++++++++--------------------- 1 
file changed, 38 insertions(+), 39 deletions(-) diff --git a/scripts/eval_rewards.py b/scripts/eval_rewards.py index 0cf3ec68..0e0de257 100644 --- a/scripts/eval_rewards.py +++ b/scripts/eval_rewards.py @@ -11,11 +11,13 @@ ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" LOAD_RUN = "Jul12_15-53-57_IPG32_S16" -REWARDS_FILE = "rewards.csv" -DATAFRAME_LOADED = True -PLOT = False -PLOT_N = 1000 # number of steps to plot +REWARDS_FILE = "rewards_nu1.csv" # generate this file from logs, if None: just plot +PLOT_REWARDS = { + "Nu=0.5": "rewards_nu05.csv", + "Nu=0.8": "rewards_nu08.csv", + "Nu=1.0": "rewards_nu1.csv", +} # Data struct fields from Robot-Software logs DATA_LIST = [ @@ -99,16 +101,10 @@ def get_rewards(it, data_struct, rewards_path): }, ignore_index=True, ) - if PLOT: - plt.figure(it) - plt.plot(rewards[:PLOT_N], label=name) - plt.title(f"Rewards Iteration {it}") - plt.legend() - plt.savefig(f"{ROOT_DIR}/{LOAD_RUN}/rewards_{it}.png") rewards_df.to_csv(rewards_path, index=False) -def setup(name="SMOOTH_RL_CONTROLLER"): +def get_data_dict(name="SMOOTH_RL_CONTROLLER"): data_dict = {} log_files = [ file @@ -124,28 +120,7 @@ def setup(name="SMOOTH_RL_CONTROLLER"): return data_dict -if __name__ == "__main__": - data_dict = setup() - rewards_path = os.path.join(ROOT_DIR, LOAD_RUN, REWARDS_FILE) - - if not DATAFRAME_LOADED: - # Compute rewards and store in dataframe - if os.path.exists(rewards_path): - os.remove(rewards_path) - for it, data_struct in data_dict.items(): - get_rewards(it, data_struct, rewards_path) - - rewards_df = pd.read_csv(rewards_path) - print(rewards_df) - - # Plot rewards stats - num_plots = len(REWARD_WEIGHTS) + 1 # +1 for total rewards - cols = 5 - rows = np.ceil(num_plots / cols).astype(int) - fig, axs = plt.subplots(rows, cols, figsize=(15, 6)) - fig.suptitle("IPG Finetuning Rewards") - axs = axs.flatten() - +def plot_rewards(rewards_df, axs, name): for i, key in enumerate(REWARD_WEIGHTS.keys()): rewards_mean = rewards_df[rewards_df["type"] == key]["mean"].reset_index( drop=True @@ -153,35 +128,59 @@ def setup(name="SMOOTH_RL_CONTROLLER"): rewards_std = rewards_df[rewards_df["type"] == key]["std"].reset_index( drop=True ) - axs[i].plot(rewards_mean, label=key) + axs[i].plot(rewards_mean, label=name) axs[i].fill_between( range(len(rewards_mean)), rewards_mean - rewards_std, rewards_mean + rewards_std, - color="b", alpha=0.2, ) + axs[i].set_title(key) axs[i].legend() - axs[i].set_xlabel("Iter") i = num_plots - 1 total_mean = rewards_df[rewards_df["type"] == "total"]["mean"].reset_index( drop=True ) total_std = rewards_df[rewards_df["type"] == "total"]["std"].reset_index(drop=True) - axs[i].plot(total_mean, label="total", color="r") + axs[i].plot(total_mean, label=name) axs[i].fill_between( range(len(total_mean)), total_mean - total_std, total_mean + total_std, - color="r", alpha=0.2, ) + axs[i].set_title("Total Rewards") axs[i].legend() - axs[i].set_xlabel("Iter") + +if __name__ == "__main__": + if REWARDS_FILE is not None: + data_dict = get_data_dict() + rewards_path = os.path.join(ROOT_DIR, LOAD_RUN, REWARDS_FILE) + if os.path.exists(rewards_path): + os.remove(rewards_path) + for it, data_struct in data_dict.items(): + get_rewards(it, data_struct, rewards_path) + + # Plot rewards stats + num_plots = len(REWARD_WEIGHTS) + 1 # +1 for total rewards + cols = 5 + rows = np.ceil(num_plots / cols).astype(int) + fig, axs = plt.subplots(rows, cols, figsize=(30, 10)) + fig.suptitle("IPG Finetuning Rewards") + axs = axs.flatten() + + for name, file in 
PLOT_REWARDS.items(): + path = os.path.join(ROOT_DIR, LOAD_RUN, file) + rewards_df = pd.read_csv(path) + plot_rewards(rewards_df, axs, name) + + for i in range(num_plots): + axs[i].set_xlabel("Iter") for i in range(num_plots, len(axs)): axs[i].axis("off") plt.tight_layout() + plt.savefig(f"{ROOT_DIR}/{LOAD_RUN}/rewards_stats.png") plt.show() From 36c1c8d0fdadda39f16f0334efb5cc968c88e2ef Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Tue, 23 Jul 2024 10:31:51 -0400 Subject: [PATCH 39/42] integrate SE for finetuning, refactor runner --- .../mini_cheetah_finetune_config.py | 49 +++-- gym/envs/mini_cheetah/mini_cheetah_ref.py | 17 +- .../mini_cheetah/mini_cheetah_ref_config.py | 23 +-- gym/envs/mini_cheetah/minimalist_cheetah.py | 27 ++- learning/algorithms/SE.py | 47 +++++ learning/algorithms/ppo_ipg.py | 6 +- learning/modules/state_estimator.py | 14 +- learning/runners/finetune_runner.py | 186 ++++++++++++++++-- learning/runners/ipg_runner.py | 12 +- learning/runners/on_policy_runner.py | 7 +- scripts/eval_rewards.py | 138 +++++++------ scripts/finetune.py | 139 ++----------- scripts/finetune_multiple.sh | 15 +- scripts/generate_commands.py | 13 +- 14 files changed, 405 insertions(+), 288 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py b/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py index 32db3fb2..f4fd30a8 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py @@ -8,7 +8,7 @@ class MiniCheetahFineTuneCfg(MiniCheetahRefCfg): class env(MiniCheetahRefCfg.env): - num_envs = 1024 + num_envs = 4096 num_actuators = 12 episode_length_s = 30.0 @@ -27,23 +27,13 @@ class control(MiniCheetahRefCfg.control): desired_sim_frequency = 500 class commands(MiniCheetahRefCfg.commands): - # * time before command are changed[s] - resampling_time = 3.0 - - # * smaller ranges for finetuning - class ranges: - lin_vel_x = [-1.0, 1.5] # min max [m/s] - lin_vel_y = 0.5 # max [m/s] - yaw_vel = 1.5 # max [rad/s] + pass class push_robots(MiniCheetahRefCfg.push_robots): pass class domain_rand(MiniCheetahRefCfg.domain_rand): - randomize_friction = False - friction_range = [0.5, 1.0] - randomize_base_mass = False - added_mass_range = [5.0, 5.0] + pass class asset(MiniCheetahRefCfg.asset): pass @@ -93,7 +83,9 @@ class critic: hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - normalize_obs = True + + # TODO: Check normalization, SAC/IPG need gradient to pass back through actor + normalize_obs = False obs = [ "base_height", "base_lin_vel", @@ -121,15 +113,32 @@ class weights: class termination_weight: termination = 0.15 + class state_estimator: + class network: + hidden_dims = [128, 128] + activation = "tanh" + dropouts = None + + obs = [ + "base_ang_vel", + "projected_gravity", + "dof_pos_obs", + "dof_vel", + "torques", + "phase_obs", + ] + targets = ["base_height", "base_lin_vel", "grf"] + normalize_obs = True + class algorithm(MiniCheetahRefRunnerCfg.algorithm): desired_kl = 0.02 # 0.02 for smooth-exploration, else 0.01 # IPG polyak = 0.995 use_cv = False - inter_nu = 0.5 + inter_nu = 0.9 beta = "off_policy" - storage_size = 4 * 32 * 4096 # num_policies*num_steps*num_envs + storage_size = 30000 # Finetuning clip_param = 0.2 @@ -143,10 +152,10 @@ class runner(MiniCheetahRefRunnerCfg.runner): experiment_name = "mini_cheetah_ref" max_iterations = 20 # number of policy updates algorithm_class_name = "PPO_IPG" - num_steps_per_env = 200 + num_steps_per_env = 32 
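        # With num_envs = 4096 (set above in this config), 32 steps per env gives
        # 4096 * 32 = 131,072 simulator transitions per finetuning update,
        # combined with the hardware logs loaded by the finetune runner.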
# Finetuning resume = True - load_run = "Jul12_15-53-57_IPG32_S16" - checkpoint = 700 - save_interval = 5 + load_run = "Jul23_00-14-23_nu02_B8" + checkpoint = 1000 + save_interval = 1 diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref.py b/gym/envs/mini_cheetah/mini_cheetah_ref.py index e620a3c2..dbee12c0 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref.py @@ -23,6 +23,7 @@ def _init_buffers(self): self.phase_obs = torch.zeros( self.num_envs, 2, dtype=torch.float, device=self.device ) + self.grf = self._compute_grf() def _reset_system(self, env_ids): super()._reset_system(env_ids) @@ -41,18 +42,19 @@ def _post_decimation_step(self): self.phase_obs = torch.cat( (torch.sin(self.phase), torch.cos(self.phase)), dim=1 ) + self.grf = self._compute_grf() def _resample_commands(self, env_ids): super()._resample_commands(env_ids) - # * with 10% chance, reset to 0 commands except for forward + # * with 20% chance, reset to 0 commands except for forward self.commands[env_ids, 1:] *= ( torch_rand_float(0, 1, (len(env_ids), 1), device=self.device).squeeze(1) - < 0.9 + < 0.8 ).unsqueeze(1) - # * with 10% chance, reset to 0 commands except for rotation + # * with 20% chance, reset to 0 commands except for rotation self.commands[env_ids, :2] *= ( torch_rand_float(0, 1, (len(env_ids), 1), device=self.device).squeeze(1) - < 0.9 + < 0.8 ).unsqueeze(1) # * with 10% chance, reset to 0 self.commands[env_ids, :] *= ( @@ -66,6 +68,13 @@ def _switch(self): -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) / 0.1 ) + def _compute_grf(self, grf_norm=True): + grf = torch.norm(self.contact_forces[:, self.feet_indices, :], dim=-1) + if grf_norm: + return torch.clamp_max(grf / 80.0, 1.0) + else: + return grf + def _reward_swing_grf(self): """Reward non-zero grf during swing (0 to pi)""" in_contact = torch.gt( diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 153e54ca..34c20416 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -67,7 +67,7 @@ class scaling(MiniCheetahCfg.scaling): class MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg): seed = -1 - runner_class_name = "OnPolicyRunner" + runner_class_name = "IPGRunner" class actor: hidden_dims = [256, 256, 128] @@ -103,7 +103,9 @@ class critic: hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - normalize_obs = True + + # TODO: Check normalization, SAC/IPG need gradient to pass back through actor + normalize_obs = False obs = [ "base_height", "base_lin_vel", @@ -121,15 +123,15 @@ class weights: tracking_lin_vel = 4.0 tracking_ang_vel = 2.0 lin_vel_z = 0.0 - ang_vel_xy = 0.01 + ang_vel_xy = 0.0 orientation = 1.0 - torques = 5.0e-7 + torques = 0.0 dof_vel = 0.0 min_base_height = 1.5 collision = 0.0 action_rate = 0.01 action_rate2 = 0.001 - stand_still = 2.0 + stand_still = 1.0 dof_pos_limits = 0.0 feet_contact_forces = 0.0 dof_near_home = 0.0 @@ -148,16 +150,11 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): use_cv = False inter_nu = 0.2 beta = "off_policy" - storage_size = 4 * 32 * 4096 # num_policies*num_stpes*num_envs - - # Finetuning - # learning_rate = 3e-4 - # max_gradient_steps = 10 - # batch_size = 4096 + storage_size = 8 * 32 * 4096 # num_policies*num_stpes*num_envs class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" - max_iterations = 700 # number of policy updates - 
algorithm_class_name = "PPO2" + max_iterations = 1000 # number of policy updates + algorithm_class_name = "PPO_IPG" num_steps_per_env = 32 diff --git a/gym/envs/mini_cheetah/minimalist_cheetah.py b/gym/envs/mini_cheetah/minimalist_cheetah.py index 1a8af6bf..48b7e47a 100644 --- a/gym/envs/mini_cheetah/minimalist_cheetah.py +++ b/gym/envs/mini_cheetah/minimalist_cheetah.py @@ -6,16 +6,23 @@ class MinimalistCheetah: Helper class for computing mini cheetah rewards """ - def __init__(self, device="cpu", dt=0.01, tracking_sigma=0.25): + def __init__( + self, device="cpu", tracking_sigma=0.25, ctrl_dt=0.01, ctrl_decimation=5 + ): self.device = device - self.dt = dt self.tracking_sigma = tracking_sigma + # Implemented as in legged robot action rate reward + self.dt = ctrl_dt * ctrl_decimation + # Default joint angles from mini_cheetah_config.py self.default_dof_pos = torch.tensor( [0.0, -0.785398, 1.596976], device=self.device ).repeat(4) + # Scales + self.command_scales = torch.tensor([3.0, 1.0, 3.0]).to(self.device) + # Previous 2 dof pos targets self.dof_target_prev = None self.dof_target_prev2 = None @@ -38,7 +45,10 @@ def set_states( self.base_lin_vel = torch.tensor(base_lin_vel, device=self.device).unsqueeze(0) self.base_ang_vel = torch.tensor(base_ang_vel, device=self.device).unsqueeze(0) self.proj_gravity = torch.tensor(proj_gravity, device=self.device).unsqueeze(0) - self.commands = torch.tensor(commands, device=self.device).unsqueeze(0) + self.commands = ( + torch.tensor(commands, device=self.device).unsqueeze(0) + * self.command_scales + ) self.dof_pos_obs = torch.tensor(dof_pos_obs, device=self.device).unsqueeze(0) self.dof_vel = torch.tensor(dof_vel, device=self.device).unsqueeze(0) self.grf = torch.tensor(grf, device=self.device).unsqueeze(0) @@ -97,14 +107,14 @@ def _reward_orientation(self): error = torch.square(self.proj_gravity[:, :2]) / self.tracking_sigma return torch.sum(torch.exp(-error), dim=1) - def _reward_swing_grf(self, contact_thresh=0.5): + def _reward_swing_grf(self, contact_thresh=50 / 80): """Reward non-zero grf during swing (0 to pi)""" in_contact = torch.gt(self.grf, contact_thresh) ph_off = torch.gt(self.phase_sin, 0) # phase <= pi rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1) return -torch.sum(rew.float(), dim=1) * (1 - self._switch()) - def _reward_stance_grf(self, contact_thresh=0.5): + def _reward_stance_grf(self, contact_thresh=50 / 80): """Reward non-zero grf during stance (pi to 2pi)""" in_contact = torch.gt(self.grf, contact_thresh) ph_off = torch.lt(self.phase_sin, 0) # phase >= pi @@ -119,9 +129,8 @@ def _reward_stand_still(self): dim=1, ) rew_vel = torch.mean(self._sqrdexp(self.dof_vel), dim=1) - rew_base_vel = 0 # TODO: This seems too noisy - # rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1) - # rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1) + rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1) + rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1) return (rew_vel + rew_pos - rew_base_vel) * self._switch() def _reward_action_rate(self): @@ -137,6 +146,6 @@ def _reward_action_rate2(self): torch.square( self.dof_pos_target - 2 * self.dof_target_prev + self.dof_target_prev2 ) - / self.dt + / self.dt**2 ) return -torch.sum(error, dim=1) diff --git a/learning/algorithms/SE.py b/learning/algorithms/SE.py index 99dfb75f..9d1f2d2d 100644 --- a/learning/algorithms/SE.py +++ b/learning/algorithms/SE.py @@ -3,9 +3,56 @@ from learning.modules import StateEstimatorNN from 
learning.storage import SERolloutStorage +from learning.utils import create_uniform_generator class StateEstimator: + def __init__( + self, + state_estimator, + normalize_obs=True, + batch_size=2**15, + max_gradient_steps=10, + learning_rate=1e-3, + device="cpu", + **kwargs, + ): + self.device = device + + self.network = state_estimator.to(self.device) + + self.batch_size = batch_size + self.max_gradient_steps = max_gradient_steps + + self.learning_rate = learning_rate + self.mean_loss = 0.0 + self.optimizer = optim.Adam(self.network.parameters(), lr=learning_rate) + + def update(self, data): + self.mean_loss = 0 + counter = 0 + generator = create_uniform_generator( + data, self.batch_size, self.max_gradient_steps + ) + for batch in generator: + loss = nn.functional.mse_loss( + self.network.evaluate(batch["SE_obs"]), batch["SE_targets"] + ) + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + self.mean_loss += loss.item() + counter += 1 + self.mean_loss /= counter + + def estimate(self, obs): + return self.network.evaluate(obs) + + def export(self, path): + self.network.export(path) + + +class OldStateEstimator: """This class provides a learned state estimator. This is trained with supervised learning, using only on-policy data collected in a rollout storage. diff --git a/learning/algorithms/ppo_ipg.py b/learning/algorithms/ppo_ipg.py index e377579f..e15ab46d 100644 --- a/learning/algorithms/ppo_ipg.py +++ b/learning/algorithms/ppo_ipg.py @@ -107,9 +107,6 @@ def update(self, data_onpol, data_offpol): for param in self.critic_q.parameters(): param.requires_grad = True - # print("On-policy data shape: ", data_onpol.shape) - # print("Off-policy data shape: ", data_offpol.shape) - def update_critic_q(self, data): self.mean_q_loss = 0 counter = 0 @@ -121,7 +118,6 @@ def update_critic_q(self, data): ) for batch in generator: with torch.no_grad(): - # TODO: check that should be inference action_next = self.actor.act_inference(batch["next_actor_obs"]) q_input_next = torch.cat( (batch["next_critic_obs"], action_next), dim=-1 @@ -252,7 +248,7 @@ def update_actor(self, data_onpol, data_offpol): else: b = self.inter_nu - loss = loss_onpol + b * loss_offpol + loss = loss_onpol + b * loss_offpol.requires_grad_() # * Gradient step self.optimizer.zero_grad() diff --git a/learning/modules/state_estimator.py b/learning/modules/state_estimator.py index f8cfe221..26a606f7 100644 --- a/learning/modules/state_estimator.py +++ b/learning/modules/state_estimator.py @@ -1,6 +1,5 @@ import torch.nn as nn -from .utils import create_MLP -from .utils import export_network +from .utils import create_MLP, export_network, RunningMeanStd class StateEstimatorNN(nn.Module): @@ -21,6 +20,7 @@ def __init__( hidden_dims=[256, 128], activation="elu", dropouts=None, + normalize_obs=True, **kwargs, ): if kwargs: @@ -30,13 +30,19 @@ def __init__( ) super().__init__() + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_inputs) + self.num_inputs = num_inputs self.num_outputs = num_outputs self.NN = create_MLP(num_inputs, num_outputs, hidden_dims, activation, dropouts) print(f"State Estimator MLP: {self.NN}") - def evaluate(self, observations): - return self.NN(observations) + def evaluate(self, obs): + if self._normalize_obs: + obs = self.obs_rms(obs) + return self.NN(obs) def export(self, path): export_network(self.NN, "state_estimator", path, self.num_inputs) diff --git a/learning/runners/finetune_runner.py b/learning/runners/finetune_runner.py index 
127d95d4..bc8a0eeb 100644 --- a/learning/runners/finetune_runner.py +++ b/learning/runners/finetune_runner.py @@ -1,12 +1,15 @@ -import torch -from tensordict import TensorDict from learning.algorithms import * # noqa: F403 -from learning.modules import Actor, SmoothActor, Critic -from learning.utils import remove_zero_weighted_rewards +from learning.algorithms import StateEstimator +from learning.modules import Actor, SmoothActor, Critic, StateEstimatorNN from learning.storage import DictStorage - +from gym.envs.mini_cheetah.minimalist_cheetah import MinimalistCheetah from .BaseRunner import BaseRunner +import torch +import os +import scipy.io +from tensordict import TensorDict + sim_storage = DictStorage() @@ -15,17 +18,22 @@ def __init__( self, env, train_cfg, - data_onpol, - data_offpol=None, + log_dir, + data_list, + data_length=2990, + data_name="SMOOTH_RL_CONTROLLER", exploration_scale=1.0, device="cpu", ): - self.data_onpol = data_onpol - self.data_offpol = data_offpol - self.exploration_scale = exploration_scale # Init calls _set_up_alg which needs exploration scale + self.exploration_scale = exploration_scale super().__init__(env, train_cfg, device) + self.log_dir = log_dir + self.data_list = data_list # Describes structure of Robot-Software logs + self.data_length = data_length # Logs must contain at least this many steps + self.data_name = data_name + def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) num_actions = self.get_action_size(self.actor_cfg["actions"]) @@ -40,8 +48,21 @@ def _set_up_alg(self): else: actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + if "state_estimator" in self.train_cfg.keys(): + self.se_cfg = self.train_cfg["state_estimator"] + state_estimator_network = StateEstimatorNN( + self.get_obs_size(self.se_cfg["obs"]), + self.get_obs_size(self.se_cfg["targets"]), + **self.se_cfg["network"], + ) + self.SE = StateEstimator( + state_estimator_network, device=self.device, **self.se_cfg + ) + else: + self.SE = None + alg_class = eval(self.cfg["algorithm_class_name"]) - if self.data_offpol is not None: + if self.cfg["algorithm_class_name"] == "PPO_IPG": critic_v = Critic(num_critic_obs, **self.critic_cfg) critic_q = Critic(num_critic_obs + num_actions, **self.critic_cfg) target_critic_q = Critic(num_critic_obs + num_actions, **self.critic_cfg) @@ -60,9 +81,137 @@ def _set_up_alg(self): def parse_train_cfg(self, train_cfg): self.cfg = train_cfg["runner"] self.alg_cfg = train_cfg["algorithm"] - remove_zero_weighted_rewards(train_cfg["critic"]["reward"]["weights"]) self.actor_cfg = train_cfg["actor"] self.critic_cfg = train_cfg["critic"] + self.train_cfg = train_cfg + + def get_data_dict(self, offpol=False): + checkpoint = self.cfg["checkpoint"] + if offpol: + # All files up until checkpoint + log_files = [ + file + for file in os.listdir(self.log_dir) + if file.endswith(".mat") and int(file.split(".")[0]) <= checkpoint + ] + log_files = sorted(log_files) + else: + # Single log file for checkpoint + log_files = [str(checkpoint) + ".mat"] + + # Initialize data dict + data = scipy.io.loadmat(os.path.join(self.log_dir, log_files[0])) + batch_size = (self.data_length - 1, len(log_files)) # -1 for next_obs + data_dict = TensorDict({}, device=self.device, batch_size=batch_size) + + # Collect all data + actor_obs_all = torch.empty(0).to(self.device) + critic_obs_all = torch.empty(0).to(self.device) + actions_all = torch.empty(0).to(self.device) + rewards_all = torch.empty(0).to(self.device) + for log in log_files: + data = 
scipy.io.loadmat(os.path.join(self.log_dir, log)) + self.data_struct = data[self.data_name][0][0] + if self.SE: + self.update_state_estimates() + + actor_obs = self.get_data_obs(self.actor_cfg["obs"], self.data_struct) + critic_obs = self.get_data_obs(self.critic_cfg["obs"], self.data_struct) + actor_obs_all = torch.cat((actor_obs_all, actor_obs), dim=1) + critic_obs_all = torch.cat((critic_obs_all, critic_obs), dim=1) + + actions_idx = self.data_list.index("dof_pos_target") + actions = ( + torch.tensor(self.data_struct[actions_idx]).to(self.device).float() + ) + actions = actions[: self.data_length] + actions = actions.reshape( + (self.data_length, 1, -1) + ) # shape (data_length, 1, n) + actions_all = torch.cat((actions_all, actions), dim=1) + + reward_weights = self.critic_cfg["reward"]["weights"] + rewards, _ = self.get_data_rewards(self.data_struct, reward_weights) + rewards = rewards[: self.data_length] + rewards = rewards.reshape((self.data_length, 1)) # shape (data_length, 1) + rewards_all = torch.cat((rewards_all, rewards), dim=1) + + data_dict["actor_obs"] = actor_obs_all[:-1] + data_dict["next_actor_obs"] = actor_obs_all[1:] + data_dict["critic_obs"] = critic_obs_all[:-1] + data_dict["next_critic_obs"] = critic_obs_all[1:] + data_dict["actions"] = actions_all[:-1] + data_dict["rewards"] = rewards_all[:-1] + + # No time outs and dones + data_dict["timed_out"] = torch.zeros(batch_size, device=self.device, dtype=bool) + data_dict["dones"] = torch.zeros(batch_size, device=self.device, dtype=bool) + + return data_dict + + def get_data_obs(self, obs_list, data_struct): + obs_all = torch.empty(0).to(self.device) + for obs_name in obs_list: + data_idx = self.data_list.index(obs_name) + obs = torch.tensor(data_struct[data_idx]).to(self.device) + obs = obs.squeeze()[: self.data_length] + obs = obs.reshape((self.data_length, 1, -1)) # shape (data_length, 1, n) + obs_all = torch.cat((obs_all, obs), dim=-1) + + return obs_all.float() + + def get_data_rewards(self, data_struct, reward_weights, dt=0.01): + minimalist_cheetah = MinimalistCheetah(device=self.device) + rewards_dict = {name: [] for name in reward_weights.keys()} # for plotting + rewards_all = torch.empty(0).to(self.device) + + for i in range(self.data_length): + minimalist_cheetah.set_states( + base_height=data_struct[1][i], + base_lin_vel=data_struct[2][i], + base_ang_vel=data_struct[3][i], + proj_gravity=data_struct[4][i], + commands=data_struct[5][i], + dof_pos_obs=data_struct[6][i], + dof_vel=data_struct[7][i], + phase_obs=data_struct[8][i], + grf=data_struct[9][i], + dof_pos_target=data_struct[10][i], + ) + total_rewards = 0 + for name, weight in reward_weights.items(): + reward = weight * eval(f"minimalist_cheetah._reward_{name}()") + rewards_dict[name].append(reward.item()) + total_rewards += reward + rewards_all = torch.cat((rewards_all, total_rewards), dim=0) + # Post process mini cheetah + minimalist_cheetah.post_process() + + rewards_dict["total"] = rewards_all.tolist() + rewards_all *= dt # scaled for alg update + + return rewards_all.float(), rewards_dict + + def update_state_estimates(self): + se_obs = torch.empty(0).to(self.device) + for obs in self.se_cfg["obs"]: + data_idx = self.data_list.index(obs) + data = torch.tensor(self.data_struct[data_idx]).to(self.device) + data = data.squeeze()[: self.data_length] + data = data.reshape((self.data_length, -1)) + se_obs = torch.cat((se_obs, data), dim=-1) + + se_targets = self.SE.estimate(se_obs.float()) + + # Overwrite data struct with state estimates + idx = 0 + for 
target in self.se_cfg["targets"]: + data_idx = self.data_list.index(target) + dim = self.data_struct[data_idx].shape[1] + self.data_struct[data_idx] = ( + se_targets[:, idx : idx + dim].cpu().detach().numpy() + ) + idx += dim def learn(self): self.alg.switch_to_train() @@ -74,12 +223,15 @@ def learn(self): batch_size=(self.num_steps_per_env, self.env.num_envs), device=self.device, ) + else: + self.data_onpol = self.get_data_dict() # Single alg update on data - if self.data_offpol is None: - self.alg.update(self.data_onpol) - else: + if self.cfg["algorithm_class_name"] == "PPO_IPG": + self.data_offpol = self.get_data_dict(offpol=True) self.alg.update(self.data_onpol, self.data_offpol) + else: + self.alg.update(self.data_onpol) def get_sim_data(self): rewards_dict = {} @@ -224,6 +376,10 @@ def load(self, path, load_optimizer=True): loaded_dict["critic_optimizer_state_dict"] ) + def load_se(self, se_path): + se_dict = torch.load(se_path) + self.SE.network.load_state_dict(se_dict["SE_state_dict"]) + def export(self, path): # Need to make a copy of actor if self.actor_cfg["smooth_exploration"]: diff --git a/learning/runners/ipg_runner.py b/learning/runners/ipg_runner.py index 4b7b5dfe..e46f5665 100644 --- a/learning/runners/ipg_runner.py +++ b/learning/runners/ipg_runner.py @@ -173,7 +173,7 @@ def learn(self): logger.toc("runtime") logger.print_to_terminal() - self.save(end=True) + self.save() @torch.no_grad def burn_in_normalization(self, n_iterations=100): @@ -232,7 +232,7 @@ def set_up_logger(self): (self.alg.actor, self.alg.critic_v, self.alg.critic_q) ) - def save(self, end=False): + def save(self, save_storage=False): os.makedirs(self.log_dir, exist_ok=True) path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) torch.save( @@ -247,13 +247,11 @@ def save(self, end=False): }, path, ) - # Save data - path_offpol = os.path.join(self.log_dir, "data_offpol_{}".format(self.it)) - torch.save(storage_offpol.get_data().cpu(), path_offpol + ".pt") - # When training ends the on-policy data was cleared - if not end: + if save_storage: path_onpol = os.path.join(self.log_dir, "data_onpol_{}".format(self.it)) + path_offpol = os.path.join(self.log_dir, "data_offpol_{}".format(self.it)) torch.save(storage_onpol.data.cpu(), path_onpol + ".pt") + torch.save(storage_offpol.get_data().cpu(), path_offpol + ".pt") def load(self, path, load_optimizer=True, load_actor_std=True): loaded_dict = torch.load(path) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 6823a6d4..ffad7a99 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -143,7 +143,7 @@ def learn(self, states_to_log_dict=None): logger.toc("runtime") logger.print_to_terminal() - self.save(end=True) + self.save() @torch.no_grad def burn_in_normalization(self, n_iterations=100): @@ -190,7 +190,7 @@ def set_up_logger(self): logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) - def save(self, end=False): + def save(self, save_storage=False): os.makedirs(self.log_dir, exist_ok=True) path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) torch.save( @@ -203,8 +203,7 @@ def save(self, end=False): }, path, ) - # Save data if training hasn't ended, otherwise storage was cleared - if not end: + if save_storage: path_data = os.path.join(self.log_dir, "data_{}".format(self.it)) torch.save(storage.data.cpu(), path_data + ".pt") diff --git a/scripts/eval_rewards.py b/scripts/eval_rewards.py index 0e0de257..87f60071 100644 --- 
a/scripts/eval_rewards.py +++ b/scripts/eval_rewards.py @@ -1,29 +1,33 @@ -from gym.envs.mini_cheetah.minimalist_cheetah import MinimalistCheetah +from gym.envs import __init__ # noqa: F401 +from gym.utils import get_args, task_registry +from gym.utils.helpers import class_to_dict + +from learning.runners.finetune_runner import FineTuneRunner from gym import LEGGED_GYM_ROOT_DIR import os -import torch import scipy.io import numpy as np import pandas as pd import matplotlib.pyplot as plt ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" -LOAD_RUN = "Jul12_15-53-57_IPG32_S16" +SE_PATH = f"{LEGGED_GYM_ROOT_DIR}/logs/SE/model_1000.pt" # if None: no SE +LOAD_RUN = "Jul23_00-14-23_nu02_B8" -REWARDS_FILE = "rewards_nu1.csv" # generate this file from logs, if None: just plot +REWARDS_FILE = None # generate this file from logs, if None: just plot PLOT_REWARDS = { + "Nu=0.0": "rewards_nu0.csv", "Nu=0.5": "rewards_nu05.csv", - "Nu=0.8": "rewards_nu08.csv", - "Nu=1.0": "rewards_nu1.csv", + "Nu=0.9": "rewards_nu09.csv", + "Nu=0.9 new": "rewards_nu09_new.csv", } # Data struct fields from Robot-Software logs DATA_LIST = [ "header", - "base_height", # 1 shape: (1, data_length) - # the following are all shape: (data_length, n) + "base_height", # 1 "base_lin_vel", # 2 "base_ang_vel", # 3 "projected_gravity", # 4 @@ -52,59 +56,7 @@ DEVICE = "cuda" -def get_rewards(it, data_struct, rewards_path): - data_length = data_struct[1].shape[1] # base_height: shape (1, data_length) - - mini_cheetah = MinimalistCheetah(device=DEVICE) - rewards_dict = {name: [] for name in REWARD_WEIGHTS.keys()} # for plotting - rewards_all = torch.empty(0).to(DEVICE) - - for i in range(data_length): - mini_cheetah.set_states( - base_height=data_struct[1][:, i], # shape (1, data_length) - base_lin_vel=data_struct[2][i], - base_ang_vel=data_struct[3][i], - proj_gravity=data_struct[4][i], - commands=data_struct[5][i], - dof_pos_obs=data_struct[6][i], - dof_vel=data_struct[7][i], - phase_obs=data_struct[8][i], - grf=data_struct[9][i], - dof_pos_target=data_struct[10][i], - ) - total_rewards = 0 - for name, weight in REWARD_WEIGHTS.items(): - reward = weight * eval(f"mini_cheetah._reward_{name}()") - rewards_dict[name].append(reward.item()) - total_rewards += reward - rewards_all = torch.cat((rewards_all, total_rewards), dim=0) - # Post process mini cheetah - mini_cheetah.post_process() - - rewards_dict["total"] = rewards_all.tolist() - - # Save rewards in dataframe - if not os.path.exists(rewards_path): - rewards_df = pd.DataFrame(columns=["iteration", "type", "mean", "std"]) - else: - rewards_df = pd.read_csv(rewards_path) - for name, rewards in rewards_dict.items(): - rewards = np.array(rewards) - mean = rewards.mean() - std = rewards.std() - rewards_df = rewards_df._append( - { - "iteration": it, - "type": name, - "mean": mean, - "std": std, - }, - ignore_index=True, - ) - rewards_df.to_csv(rewards_path, index=False) - - -def get_data_dict(name="SMOOTH_RL_CONTROLLER"): +def update_rewards_df(runner): data_dict = {} log_files = [ file @@ -116,8 +68,35 @@ def get_data_dict(name="SMOOTH_RL_CONTROLLER"): iteration = int(file.split(".")[0]) path = os.path.join(ROOT_DIR, LOAD_RUN, file) data = scipy.io.loadmat(path) - data_dict[iteration] = data[name][0][0] - return data_dict + data_dict[iteration] = data[runner.data_name][0][0] + + rewards_path = os.path.join(ROOT_DIR, LOAD_RUN, REWARDS_FILE) + if os.path.exists(rewards_path): + os.remove(rewards_path) + + # Get rewards from runner + for it, data_struct in data_dict.items(): + _, 
rewards_dict = runner.get_data_rewards(data_struct, REWARD_WEIGHTS) + + # Save rewards in dataframe + if not os.path.exists(rewards_path): + rewards_df = pd.DataFrame(columns=["iteration", "type", "mean", "std"]) + else: + rewards_df = pd.read_csv(rewards_path) + for name, rewards in rewards_dict.items(): + rewards = np.array(rewards) + mean = rewards.mean() + std = rewards.std() + rewards_df = rewards_df._append( + { + "iteration": it, + "type": name, + "mean": mean, + "std": std, + }, + ignore_index=True, + ) + rewards_df.to_csv(rewards_path, index=False) def plot_rewards(rewards_df, axs, name): @@ -154,20 +133,39 @@ def plot_rewards(rewards_df, axs, name): axs[i].legend() +def setup(): + args = get_args() + + env_cfg, train_cfg = task_registry.create_cfgs(args) + task_registry.make_gym_and_sim() + env = task_registry.make_env(name=args.task, env_cfg=env_cfg) + + train_cfg = class_to_dict(train_cfg) + log_dir = os.path.join(ROOT_DIR, train_cfg["runner"]["load_run"]) + + runner = FineTuneRunner( + env, + train_cfg, + log_dir, + data_list=DATA_LIST, + device=DEVICE, + ) + + return runner + + if __name__ == "__main__": if REWARDS_FILE is not None: - data_dict = get_data_dict() - rewards_path = os.path.join(ROOT_DIR, LOAD_RUN, REWARDS_FILE) - if os.path.exists(rewards_path): - os.remove(rewards_path) - for it, data_struct in data_dict.items(): - get_rewards(it, data_struct, rewards_path) + runner = setup() + if SE_PATH is not None: + runner.load_se(SE_PATH) + update_rewards_df(runner) # Plot rewards stats num_plots = len(REWARD_WEIGHTS) + 1 # +1 for total rewards cols = 5 rows = np.ceil(num_plots / cols).astype(int) - fig, axs = plt.subplots(rows, cols, figsize=(30, 10)) + fig, axs = plt.subplots(rows, cols, figsize=(20, 8)) fig.suptitle("IPG Finetuning Rewards") axs = axs.flatten() diff --git a/scripts/finetune.py b/scripts/finetune.py index ca7d404a..6247a466 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -2,34 +2,28 @@ from gym.utils import get_args, task_registry from gym.utils.helpers import class_to_dict -from learning.algorithms import * # noqa: F403 from learning.runners.finetune_runner import FineTuneRunner -from gym.envs.mini_cheetah.minimalist_cheetah import MinimalistCheetah from gym import LEGGED_GYM_ROOT_DIR import os import torch -import scipy.io import numpy as np -from tensordict import TensorDict ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" +SE_PATH = f"{LEGGED_GYM_ROOT_DIR}/logs/SE/model_1000.pt" # if None: no SE OUTPUT_FILE = "output.txt" USE_SIMULATOR = True -DATA_LENGTH = 2990 # Robot-Software logs must contain at least this many steps # Scales -EXPLORATION_SCALE = 0.8 # used during data collection +EXPLORATION_SCALE = 0.5 # used during data collection ACTION_SCALES = np.tile(np.array([0.2, 0.3, 0.3]), 4) -COMMAND_SCALES = np.array([3.0, 1.0, 3.0]) # Data struct fields from Robot-Software logs DATA_LIST = [ "header", - "base_height", # 1 shape: (1, data_length) - # the following are all shape: (data_length, n) + "base_height", # 1 "base_lin_vel", # 2 "base_ang_vel", # 3 "projected_gravity", # 4 @@ -39,141 +33,29 @@ "phase_obs", # 8 "grf", # 9 "dof_pos_target", # 10 - "exploration_noise", # 11 + "torques", # 11 + "exploration_noise", # 12 "footer", ] DEVICE = "cuda" -def get_obs(obs_list, data_struct): - obs_all = torch.empty(0).to(DEVICE) - for obs_name in obs_list: - data_idx = DATA_LIST.index(obs_name) - obs = torch.tensor(data_struct[data_idx]).to(DEVICE) - obs = obs.squeeze()[:DATA_LENGTH] - obs = obs.reshape((DATA_LENGTH, 1, -1)) # 
shape (data_length, 1, n) - obs_all = torch.cat((obs_all, obs), dim=-1) - - return obs_all.float() - - -def get_rewards(data_struct, reward_weights, dt): - minimalist_cheetah = MinimalistCheetah(device=DEVICE) - rewards_all = torch.empty(0).to(DEVICE) - - for i in range(DATA_LENGTH): - minimalist_cheetah.set_states( - base_height=data_struct[1][:, i], # shape (1, data_length) - base_lin_vel=data_struct[2][i], - base_ang_vel=data_struct[3][i], - proj_gravity=data_struct[4][i], - commands=data_struct[5][i] * COMMAND_SCALES, - dof_pos_obs=data_struct[6][i], - dof_vel=data_struct[7][i], - phase_obs=data_struct[8][i], - grf=data_struct[9][i], - dof_pos_target=data_struct[10][i], - ) - total_rewards = 0 - for name, weight in reward_weights.items(): - reward = weight * eval(f"minimalist_cheetah._reward_{name}()") - total_rewards += dt * reward # scaled - rewards_all = torch.cat((rewards_all, total_rewards), dim=0) - # Post process mini cheetah - minimalist_cheetah.post_process() - - return rewards_all.float() - - -def get_data_dict(train_cfg, env_cfg, offpol=False, name="SMOOTH_RL_CONTROLLER"): - load_run = train_cfg["runner"]["load_run"] - checkpoint = train_cfg["runner"]["checkpoint"] - run_dir = os.path.join(ROOT_DIR, load_run) - - if offpol: - # All files up until checkpoint - log_files = [ - file - for file in os.listdir(run_dir) - if file.endswith(".mat") and int(file.split(".")[0]) <= checkpoint - ] - log_files = sorted(log_files) - else: - # Single log file for checkpoint - log_files = [str(checkpoint) + ".mat"] - - # Initialize data dict - data = scipy.io.loadmat(os.path.join(run_dir, log_files[0])) - batch_size = (DATA_LENGTH - 1, len(log_files)) # -1 for next_obs - data_dict = TensorDict({}, device=DEVICE, batch_size=batch_size) - - # Get all data - actor_obs_all = torch.empty(0).to(DEVICE) - critic_obs_all = torch.empty(0).to(DEVICE) - actions_all = torch.empty(0).to(DEVICE) - rewards_all = torch.empty(0).to(DEVICE) - for log in log_files: - data = scipy.io.loadmat(os.path.join(run_dir, log)) - data_struct = data[name][0][0] - - actor_obs = get_obs(train_cfg["actor"]["obs"], data_struct) - critic_obs = get_obs(train_cfg["critic"]["obs"], data_struct) - actor_obs_all = torch.cat((actor_obs_all, actor_obs), dim=1) - critic_obs_all = torch.cat((critic_obs_all, critic_obs), dim=1) - - actions_idx = DATA_LIST.index("dof_pos_target") - actions = torch.tensor(data_struct[actions_idx]).to(DEVICE).float() - actions = actions[:DATA_LENGTH] - actions = actions.reshape((DATA_LENGTH, 1, -1)) # shape (data_length, 1, n) - actions_all = torch.cat((actions_all, actions), dim=1) - - reward_weights = train_cfg["critic"]["reward"]["weights"] - dt = 1.0 / env_cfg["control"]["ctrl_frequency"] - rewards = get_rewards(data_struct, reward_weights, dt) - rewards = rewards[:DATA_LENGTH] - rewards = rewards.reshape((DATA_LENGTH, 1)) # shape (data_length, 1) - rewards_all = torch.cat((rewards_all, rewards), dim=1) - - data_dict["actor_obs"] = actor_obs_all[:-1] - data_dict["next_actor_obs"] = actor_obs_all[1:] - data_dict["critic_obs"] = critic_obs_all[:-1] - data_dict["next_critic_obs"] = critic_obs_all[1:] - data_dict["actions"] = actions_all[:-1] - data_dict["rewards"] = rewards_all[:-1] - - # No time outs and dones - data_dict["timed_out"] = torch.zeros(batch_size, device=DEVICE, dtype=bool) - data_dict["dones"] = torch.zeros(batch_size, device=DEVICE, dtype=bool) - - return data_dict - - def setup(): args = get_args() env_cfg, train_cfg = task_registry.create_cfgs(args) task_registry.make_gym_and_sim() - - 
if USE_SIMULATOR: - env = task_registry.make_env(name=args.task, env_cfg=env_cfg) - else: - env = None + env = task_registry.make_env(name=args.task, env_cfg=env_cfg) train_cfg = class_to_dict(train_cfg) - env_cfg = class_to_dict(env_cfg) - data_onpol = get_data_dict(train_cfg, env_cfg, offpol=False) - - if train_cfg["runner"]["algorithm_class_name"] == "PPO_IPG": - data_offpol = get_data_dict(train_cfg, env_cfg, offpol=True) - else: - data_offpol = None + log_dir = os.path.join(ROOT_DIR, train_cfg["runner"]["load_run"]) runner = FineTuneRunner( env, train_cfg, - data_onpol, - data_offpol, + log_dir, + data_list=DATA_LIST, exploration_scale=EXPLORATION_SCALE, device=DEVICE, ) @@ -187,6 +69,9 @@ def finetune(runner): load_path = os.path.join(ROOT_DIR, load_run, "model_" + str(checkpoint) + ".pt") runner.load(load_path) + if SE_PATH is not None: + runner.load_se(SE_PATH) + # Perform a single update runner.learn() diff --git a/scripts/finetune_multiple.sh b/scripts/finetune_multiple.sh index fa81966f..95e84728 100644 --- a/scripts/finetune_multiple.sh +++ b/scripts/finetune_multiple.sh @@ -2,9 +2,10 @@ source /home/lmolnar/miniconda3/etc/profile.d/conda.sh # Args -LOAD_RUN="Jul12_15-53-57_IPG32_S16" -CHECKPOINT=700 -N_RUNS=5 +LOAD_RUN="Jul23_00-14-23_nu02_B8" +CHECKPOINT=1000 +N_RUNS=6 +EVAL=false # make sure to turn exploration on/off in RS # Set directories QGYM_DIR="/home/lmolnar/workspace/QGym" @@ -39,9 +40,11 @@ do cp ${RS_DIR}/logging/matlab_logs/${CHECKPOINT}.mat ${QGYM_LOG_DIR} # Finetune in QGym - conda deactivate - conda activate qgym - python ${QGYM_DIR}/scripts/finetune.py --task=mini_cheetah_finetune --headless --load_run=${LOAD_RUN} --checkpoint=${CHECKPOINT} + if [ "$EVAL" = false ] ; then + conda deactivate + conda activate qgym + python ${QGYM_DIR}/scripts/finetune.py --task=mini_cheetah_finetune --headless --load_run=${LOAD_RUN} --checkpoint=${CHECKPOINT} + fi # Copy policy to RS CHECKPOINT=$((CHECKPOINT + 1)) diff --git a/scripts/generate_commands.py b/scripts/generate_commands.py index c1de1425..e8ef18a4 100644 --- a/scripts/generate_commands.py +++ b/scripts/generate_commands.py @@ -1,10 +1,15 @@ import numpy as np import random -# Command ranges -x_range = [-0.5, 1] # [m/s] -y_range = [-0.25, 0.25] # [m/s] -yaw_range = [-1, 1] # [rad/s] +# Command ranges during training: +# x_range = [-2.0, 3.0] # [m/s] +# y_range = [-1.0, 1.0] # [m/s] +# yaw_range = [-3.0. 
3.0] # [rad/s] + +# Command ranges for finetuning: +x_range = [-1.0, 1.5] # [m/s] +y_range = [-0.5, 0.5] # [m/s] +yaw_range = [-1.5, 1.5] # [rad/s] # Generate random command sequence N = 10 From edf31679a25e106ca82b0f6690f07e3118d24ed9 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Tue, 23 Jul 2024 18:11:50 -0400 Subject: [PATCH 40/42] bugfix scale action noise, added scaling and refactorings --- gym/envs/base/task_skeleton.py | 3 ++- gym/envs/mini_cheetah/minimalist_cheetah.py | 13 ++++++++++--- learning/algorithms/ppo_ipg.py | 2 +- learning/modules/smooth_actor.py | 2 +- scripts/train.py | 5 ++--- 5 files changed, 16 insertions(+), 9 deletions(-) diff --git a/gym/envs/base/task_skeleton.py b/gym/envs/base/task_skeleton.py index ffbac6ce..0974970a 100644 --- a/gym/envs/base/task_skeleton.py +++ b/gym/envs/base/task_skeleton.py @@ -45,6 +45,7 @@ def reset(self): """Reset all robots""" self._reset_idx(torch.arange(self.num_envs, device=self.device)) self.step() + self.episode_length_buf[:] = 0 def _reset_buffers(self): self.to_be_reset[:] = False @@ -67,7 +68,7 @@ def _eval_reward(self, name): def _check_terminations_and_timeouts(self): """Check if environments need to be reset""" contact_forces = self.contact_forces[:, self.termination_contact_indices, :] - self.terminated = torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1) + self.terminated |= torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1) self.timed_out = self.episode_length_buf >= self.max_episode_length self.to_be_reset = self.timed_out | self.terminated diff --git a/gym/envs/mini_cheetah/minimalist_cheetah.py b/gym/envs/mini_cheetah/minimalist_cheetah.py index 48b7e47a..4702f6ac 100644 --- a/gym/envs/mini_cheetah/minimalist_cheetah.py +++ b/gym/envs/mini_cheetah/minimalist_cheetah.py @@ -22,6 +22,8 @@ def __init__( # Scales self.command_scales = torch.tensor([3.0, 1.0, 3.0]).to(self.device) + self.dof_pos_scales = torch.tensor([0.2, 0.3, 0.3]).to(self.device).repeat(4) + self.dof_vel_scales = torch.tensor([2.0, 2.0, 4.0]).to(self.device).repeat(4) # Previous 2 dof pos targets self.dof_target_prev = None @@ -49,15 +51,20 @@ def set_states( torch.tensor(commands, device=self.device).unsqueeze(0) * self.command_scales ) - self.dof_pos_obs = torch.tensor(dof_pos_obs, device=self.device).unsqueeze(0) - self.dof_vel = torch.tensor(dof_vel, device=self.device).unsqueeze(0) + self.dof_pos_obs = ( + torch.tensor(dof_pos_obs, device=self.device).unsqueeze(0) + * self.dof_pos_scales + ) + self.dof_vel = ( + torch.tensor(dof_vel, device=self.device).unsqueeze(0) * self.dof_vel_scales + ) self.grf = torch.tensor(grf, device=self.device).unsqueeze(0) # Compute phase sin phase_obs = torch.tensor(phase_obs, device=self.device).unsqueeze(0) self.phase_sin = phase_obs[:, 0].unsqueeze(0) - # Set targets + # Set targets, these are scaled in Robot-Software self.dof_pos_target = torch.tensor( dof_pos_target, device=self.device ).unsqueeze(0) diff --git a/learning/algorithms/ppo_ipg.py b/learning/algorithms/ppo_ipg.py index e15ab46d..0dd7907f 100644 --- a/learning/algorithms/ppo_ipg.py +++ b/learning/algorithms/ppo_ipg.py @@ -256,7 +256,7 @@ def update_actor(self, data_onpol, data_offpol): nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) self.optimizer.step() self.mean_surrogate_loss += loss_onpol.item() - self.mean_offpol_loss += loss_offpol.item() + self.mean_offpol_loss += b * loss_offpol.item() counter += 1 self.mean_surrogate_loss /= counter self.mean_offpol_loss /= counter diff --git 
a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py index 72c4ace1..f824d263 100644 --- a/learning/modules/smooth_actor.py +++ b/learning/modules/smooth_actor.py @@ -107,7 +107,7 @@ def update_distribution(self, observations): def act(self, observations): self.update_distribution(observations) mean = self.distribution.mean - sample = mean + self.get_noise() + sample = mean + self.get_noise() * self.exploration_scale if self.debug: path = f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_smooth.csv" self.log_actions(mean[0][2], sample[0][2], path) diff --git a/scripts/train.py b/scripts/train.py index 283717e8..40966682 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -1,5 +1,5 @@ from gym.envs import __init__ # noqa: F401 -from gym.utils import get_args, task_registry, randomize_episode_counters +from gym.utils import get_args, task_registry from gym.utils.logging_and_saving import wandb_singleton from gym.utils.logging_and_saving import local_code_save_helper @@ -12,9 +12,8 @@ def setup(): task_registry.make_gym_and_sim() wandb_helper.setup_wandb(env_cfg=env_cfg, train_cfg=train_cfg, args=args) env = task_registry.make_env(name=args.task, env_cfg=env_cfg) - randomize_episode_counters(env) + # randomize_episode_counters(env) - randomize_episode_counters(env) policy_runner = task_registry.make_alg_runner(env, train_cfg) local_code_save_helper.save_local_files_to_logs(train_cfg.log_dir) From 276f84b0bc59bc2e1c2398f7d44e7c50532b221b Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 26 Jul 2024 11:23:11 -0400 Subject: [PATCH 41/42] merged LinkedIPG, bugfix target network save, refactorings --- gym/envs/pendulum/pendulum_config.py | 7 +- gym/utils/helpers.py | 8 + learning/algorithms/__init__.py | 1 + learning/algorithms/linked_ipg.py | 316 +++++++++++++++++++++++++++ learning/algorithms/ppo_ipg.py | 12 +- learning/runners/finetune_runner.py | 114 +++++++--- learning/runners/ipg_runner.py | 12 +- scripts/eval_rewards.py | 24 +- scripts/finetune.py | 69 +++++- scripts/finetune_multiple.sh | 20 +- scripts/visualize_ipg.py | 6 +- 11 files changed, 508 insertions(+), 81 deletions(-) create mode 100644 learning/algorithms/linked_ipg.py diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 399060a2..51c44557 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -125,16 +125,17 @@ class algorithm(FixedRobotCfgPPO.algorithm): desired_kl = 0.01 # IPG - polyak = 0.995 + polyak = 0.9 use_cv = False - inter_nu = 0.2 + inter_nu = 0.5 beta = "off_policy" storage_size = 4 * 100 * 4096 # num_policies*num_steps*num_envs + val_interpolation = 0.5 # 0: use V(s'), 1: use Q(s', pi(s')) class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "pendulum" max_iterations = 100 # number of policy updates - algorithm_class_name = "PPO_IPG" + algorithm_class_name = "LinkedIPG" num_steps_per_env = 100 save_interval = 20 diff --git a/gym/utils/helpers.py b/gym/utils/helpers.py index 90af8bce..2f3a5dcf 100644 --- a/gym/utils/helpers.py +++ b/gym/utils/helpers.py @@ -181,6 +181,9 @@ def update_cfg_from_args(env_cfg, train_cfg, args): train_cfg.runner.checkpoint = args.checkpoint if args.rl_device is not None: train_cfg.runner.device = args.rl_device + # * IPG parameters + if args.inter_nu is not None: + train_cfg.algorithm.inter_nu = args.inter_nu def get_args(custom_parameters=None): @@ -301,6 +304,11 @@ def get_args(custom_parameters=None): "default": False, "help": "Use original config file for loaded 
policy.", }, + { + "name": "--inter_nu", + "type": float, + "help": "Interpolation parameter for IPG.", + }, ] # * parse arguments args = gymutil.parse_arguments( diff --git a/learning/algorithms/__init__.py b/learning/algorithms/__init__.py index b775a439..8bee38d0 100644 --- a/learning/algorithms/__init__.py +++ b/learning/algorithms/__init__.py @@ -33,4 +33,5 @@ from .ppo import PPO from .ppo2 import PPO2 from .ppo_ipg import PPO_IPG +from .linked_ipg import LinkedIPG from .SE import StateEstimator diff --git a/learning/algorithms/linked_ipg.py b/learning/algorithms/linked_ipg.py new file mode 100644 index 00000000..de024276 --- /dev/null +++ b/learning/algorithms/linked_ipg.py @@ -0,0 +1,316 @@ +import torch +import torch.nn as nn +import torch.optim as optim + +from learning.utils import ( + create_uniform_generator, + compute_generalized_advantages, + normalize, + polyak_update, +) + + +class LinkedIPG: + def __init__( + self, + actor, + critic_v, + critic_q, + target_critic_q, + batch_size=2**15, + max_gradient_steps=10, + clip_param=0.2, + gamma=0.998, + lam=0.95, + entropy_coef=0.0, + learning_rate=1e-3, + max_grad_norm=1.0, + use_clipped_value_loss=True, + schedule="fixed", + desired_kl=0.01, + polyak=0.995, + use_cv=False, + inter_nu=0.2, + beta="off_policy", + device="cpu", + lr_range=[1e-4, 1e-2], + lr_ratio=1.3, + val_interpolation=0.5, + **kwargs, + ): + self.device = device + + self.desired_kl = desired_kl + self.schedule = schedule + self.learning_rate = learning_rate + self.lr_range = lr_range + self.lr_ratio = lr_ratio + + # * PPO components + self.actor = actor.to(self.device) + self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) + self.critic_v = critic_v.to(self.device) + self.critic_v_optimizer = optim.Adam( + self.critic_v.parameters(), lr=learning_rate + ) + + # * IPG components + self.critic_q = critic_q.to(self.device) + self.critic_q_optimizer = optim.Adam( + self.critic_q.parameters(), lr=learning_rate + ) + self.target_critic_q = target_critic_q.to(self.device) + self.target_critic_q.load_state_dict(self.critic_q.state_dict()) + + # * PPO parameters + self.clip_param = clip_param + self.batch_size = batch_size + self.max_gradient_steps = max_gradient_steps + self.entropy_coef = entropy_coef + self.gamma = gamma + self.lam = lam + self.max_grad_norm = max_grad_norm + self.use_clipped_value_loss = use_clipped_value_loss + + # * IPG parameters + self.polyak = polyak + self.use_cv = use_cv + self.inter_nu = inter_nu + self.beta = beta + self.val_interpolation = val_interpolation + + def switch_to_train(self): + self.actor.train() + self.critic_v.train() + self.critic_q.train() + + def act(self, obs): + return self.actor.act(obs).detach() + + def update(self, data_onpol, data_offpol): + # On-policy GAE + data_onpol["values"] = self.critic_v.evaluate(data_onpol["critic_obs"]) + data_onpol["advantages"] = compute_generalized_advantages( + data_onpol, self.gamma, self.lam, self.critic_v + ) + data_onpol["returns"] = data_onpol["advantages"] + data_onpol["values"] + data_onpol["advantages"] = normalize(data_onpol["advantages"]) + + data_offpol["values"] = self.critic_v.evaluate(data_offpol["critic_obs"]) + data_offpol["advantages"] = compute_generalized_advantages( + data_offpol, self.gamma, self.lam, self.critic_v + ) + data_offpol["returns"] = data_offpol["advantages"] + data_offpol["values"] + + self.update_critic_v(data_offpol) + self.update_critic_q(data_offpol) + self.update_actor(data_onpol, data_offpol) + + # def update_joint_critics(self, 
data_onpol, data_offpol): + # self.mean_q_loss = 0 + # self.mean_value_loss = 0 + # counter = 0 + # generator_onpol = create_uniform_generator( + # data_onpol, + # self.batch_size, + # max_gradient_steps=self.max_gradient_steps, + # ) + # generator_offpol = create_uniform_generator( + # data_offpol, + # self.batch_size, + # max_gradient_steps=self.max_gradient_steps, + # ) + # for batch_onpol, batch_offpol in zip(generator_onpol, generator_offpol): + # with torch.no_grad(): + # action_next_onpol = self.actor.act_inference( + # batch_onpol["next_actor_obs"] + # ) + # q_input_next_onpol = torch.cat( + # batch_onpol["next_critic_obs"], action_next_onpol + # ) + # action_next_offpol = self.actor.act_inference( + # batch_offpol["next_actor_obs"] + # ) + # q_input_next_offpol = torch.cat( + # batch_offpol["next_critic_obs"], action_next_offpol + # ) + # q_value_offpol = self.critic_q.evaluate(q_input_next_offpol) + + # loss_V_returns = self.critic_v.loss_fn( + # batch_onpol["critic_obs"], batch_onpol["returns"] + # ) + # loss_V_Q = nn.functional.mse_loss( + # self.critic_v.evaluate(batch_onpol["critic_obs"]), + # self.critic_q.evaluate(batch_onpol["critic_obs"]), + # reduction="mean") + # + # with torch.no_grad(): + # action_next = self.actor.act_inference(batch_offpol["next_actor_obs"]) + # q_input_next = torch.cat( + # (batch_offpol["next_critic_obs"], action_next), dim=-1 + # ) + + def update_critic_q(self, data): + self.mean_q_loss = 0 + counter = 0 + + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) + for batch in generator: + with torch.no_grad(): + action_next = self.actor.act_inference(batch["next_actor_obs"]) + q_input_next = torch.cat( + (batch["next_critic_obs"], action_next), dim=-1 + ) + q_next = self.target_critic_q.evaluate(q_input_next) + v_next = self.critic_v.evaluate(batch["next_critic_obs"]) + q_target = batch["rewards"] + batch["dones"].logical_not() * ( + self.gamma + * ( + q_next * self.val_interpolation + + v_next * (1 - self.val_interpolation) + ) + ) + q_input = torch.cat((batch["critic_obs"], batch["actions"]), dim=-1) + q_loss = self.critic_q.loss_fn(q_input, q_target) + self.critic_q_optimizer.zero_grad() + q_loss.backward() + nn.utils.clip_grad_norm_(self.critic_q.parameters(), self.max_grad_norm) + self.critic_q_optimizer.step() + self.mean_q_loss += q_loss.item() + counter += 1 + + # TODO: check where to do polyak update (IPG repo does it here) + self.target_critic_q = polyak_update( + self.critic_q, self.target_critic_q, self.polyak + ) + self.mean_q_loss /= counter + + def update_critic_v(self, data): + self.mean_value_loss = 0 + counter = 0 + + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) + for batch in generator: + value_loss = self.critic_v.loss_fn(batch["critic_obs"], batch["returns"]) + self.critic_v_optimizer.zero_grad() + value_loss.backward() + nn.utils.clip_grad_norm_(self.critic_v.parameters(), self.max_grad_norm) + self.critic_v_optimizer.step() + self.mean_value_loss += value_loss.item() + counter += 1 + self.mean_value_loss /= counter + + def update_actor(self, data_onpol, data_offpol): + self.mean_surrogate_loss = 0 + self.mean_offpol_loss = 0 + counter = 0 + + self.actor.update_distribution(data_onpol["actor_obs"]) + data_onpol["old_sigma"] = self.actor.action_std.detach() + data_onpol["old_mu"] = self.actor.action_mean.detach() + data_onpol["old_actions_log_prob"] = self.actor.get_actions_log_prob( + 
data_onpol["actions"] + ).detach() + + # Generate off-policy batches and use all on-policy data + generator_offpol = create_uniform_generator( + data_offpol, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) + for batch_offpol in generator_offpol: + self.actor.update_distribution(data_onpol["actor_obs"]) + actions_log_prob_onpol = self.actor.get_actions_log_prob( + data_onpol["actions"] + ) + mu_onpol = self.actor.action_mean + sigma_onpol = self.actor.action_std + + # * KL + if self.desired_kl is not None and self.schedule == "adaptive": + with torch.inference_mode(): + kl = torch.sum( + torch.log(sigma_onpol / data_onpol["old_sigma"] + 1.0e-5) + + ( + torch.square(data_onpol["old_sigma"]) + + torch.square(data_onpol["old_mu"] - mu_onpol) + ) + / (2.0 * torch.square(sigma_onpol)) + - 0.5, + axis=-1, + ) + kl_mean = torch.mean(kl) + lr_min, lr_max = self.lr_range + + if kl_mean > self.desired_kl * 2.0: + self.learning_rate = max( + lr_min, self.learning_rate / self.lr_ratio + ) + elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: + self.learning_rate = min( + lr_max, self.learning_rate * self.lr_ratio + ) + + for param_group in self.optimizer.param_groups: + # ! check this + param_group["lr"] = self.learning_rate + + # * On-policy surrogate loss + adv_onpol = data_onpol["advantages"] + if self.use_cv: + # TODO: control variate + critic_based_adv = 0 # get_control_variate(data_onpol, self.critic_v) + learning_signals = (adv_onpol - critic_based_adv) * (1 - self.inter_nu) + else: + learning_signals = adv_onpol * (1 - self.inter_nu) + + ratio = torch.exp( + actions_log_prob_onpol - data_onpol["old_actions_log_prob"] + ) + ratio_clipped = torch.clamp( + ratio, 1.0 - self.clip_param, 1.0 + self.clip_param + ) + surrogate = -learning_signals * ratio + surrogate_clipped = -learning_signals * ratio_clipped + loss_onpol = torch.max(surrogate, surrogate_clipped).mean() + + # * Off-policy loss + if self.beta == "on_policy": + loss_offpol = self.compute_loss_offpol(data_onpol) + elif self.beta == "off_policy": + loss_offpol = self.compute_loss_offpol(batch_offpol) + else: + raise ValueError(f"Invalid beta value: {self.beta}") + + if self.use_cv: + b = 1 + else: + b = self.inter_nu + + loss = loss_onpol + b * loss_offpol + + # * Gradient step + self.optimizer.zero_grad() + loss.backward() + nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) + self.optimizer.step() + self.mean_surrogate_loss += loss_onpol.item() + self.mean_offpol_loss += b * loss_offpol.item() + counter += 1 + self.mean_surrogate_loss /= counter + self.mean_offpol_loss /= counter + + def compute_loss_offpol(self, data): + obs = data["actor_obs"] + actions = self.actor.act_inference(obs) + q_input = torch.cat((data["critic_obs"], actions), dim=-1) + q_value = self.critic_q.evaluate(q_input) + return -q_value.mean() diff --git a/learning/algorithms/ppo_ipg.py b/learning/algorithms/ppo_ipg.py index 0dd7907f..0738b86d 100644 --- a/learning/algorithms/ppo_ipg.py +++ b/learning/algorithms/ppo_ipg.py @@ -95,17 +95,8 @@ def update(self, data_onpol, data_offpol): data_onpol["advantages"] = normalize(data_onpol["advantages"]) self.update_critic_q(data_offpol) - - # Freeze Q-network params - for param in self.critic_q.parameters(): - param.requires_grad = False - - self.update_actor(data_onpol, data_offpol) self.update_critic_v(data_onpol) - - # Un-freeze Q-network params - for param in self.critic_q.parameters(): - param.requires_grad = True + self.update_actor(data_onpol, data_offpol) def 
update_critic_q(self, data): self.mean_q_loss = 0 @@ -129,6 +120,7 @@ def update_critic_q(self, data): ) q_input = torch.cat((batch["critic_obs"], batch["actions"]), dim=-1) q_loss = self.critic_q.loss_fn(q_input, q_target) + print(q_loss.item()) self.critic_q_optimizer.zero_grad() q_loss.backward() nn.utils.clip_grad_norm_(self.critic_q.parameters(), self.max_grad_norm) diff --git a/learning/runners/finetune_runner.py b/learning/runners/finetune_runner.py index bc8a0eeb..d8b37756 100644 --- a/learning/runners/finetune_runner.py +++ b/learning/runners/finetune_runner.py @@ -20,19 +20,27 @@ def __init__( train_cfg, log_dir, data_list, - data_length=2990, + data_length=1500, data_name="SMOOTH_RL_CONTROLLER", + se_path=None, + use_simulator=True, exploration_scale=1.0, device="cpu", ): - # Init calls _set_up_alg which needs exploration scale - self.exploration_scale = exploration_scale - super().__init__(env, train_cfg, device) + # Instead of super init, only set necessary attributes + self.env = env + self.parse_train_cfg(train_cfg) + self.num_steps_per_env = self.cfg["num_steps_per_env"] self.log_dir = log_dir self.data_list = data_list # Describes structure of Robot-Software logs self.data_length = data_length # Logs must contain at least this many steps self.data_name = data_name + self.se_path = se_path + self.use_simulator = use_simulator + self.exploration_scale = exploration_scale + self.device = device + self._set_up_alg() def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) @@ -48,21 +56,11 @@ def _set_up_alg(self): else: actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) - if "state_estimator" in self.train_cfg.keys(): - self.se_cfg = self.train_cfg["state_estimator"] - state_estimator_network = StateEstimatorNN( - self.get_obs_size(self.se_cfg["obs"]), - self.get_obs_size(self.se_cfg["targets"]), - **self.se_cfg["network"], - ) - self.SE = StateEstimator( - state_estimator_network, device=self.device, **self.se_cfg - ) - else: - self.SE = None + alg_name = self.cfg["algorithm_class_name"] + alg_class = eval(alg_name) + self.ipg = alg_name in ["PPO_IPG", "LinkedIPG"] - alg_class = eval(self.cfg["algorithm_class_name"]) - if self.cfg["algorithm_class_name"] == "PPO_IPG": + if self.ipg: critic_v = Critic(num_critic_obs, **self.critic_cfg) critic_q = Critic(num_critic_obs + num_actions, **self.critic_cfg) target_critic_q = Critic(num_critic_obs + num_actions, **self.critic_cfg) @@ -78,6 +76,20 @@ def _set_up_alg(self): critic = Critic(num_critic_obs, **self.critic_cfg) self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) + if "state_estimator" in self.train_cfg.keys() and self.se_path is not None: + self.se_cfg = self.train_cfg["state_estimator"] + state_estimator_network = StateEstimatorNN( + self.get_obs_size(self.se_cfg["obs"]), + self.get_obs_size(self.se_cfg["targets"]), + **self.se_cfg["network"], + ) + self.SE = StateEstimator( + state_estimator_network, device=self.device, **self.se_cfg + ) + self.load_se(self.se_path) + else: + self.SE = None + def parse_train_cfg(self, train_cfg): self.cfg = train_cfg["runner"] self.alg_cfg = train_cfg["algorithm"] @@ -85,7 +97,10 @@ def parse_train_cfg(self, train_cfg): self.critic_cfg = train_cfg["critic"] self.train_cfg = train_cfg - def get_data_dict(self, offpol=False): + def get_data_dict(self, offpol=False, load_path=None, save_path=None): + # Concatenate data with loaded dict + loaded_data_dict = torch.load(load_path) if load_path else None + checkpoint = self.cfg["checkpoint"] 
if offpol: # All files up until checkpoint @@ -147,6 +162,26 @@ def get_data_dict(self, offpol=False): data_dict["timed_out"] = torch.zeros(batch_size, device=self.device, dtype=bool) data_dict["dones"] = torch.zeros(batch_size, device=self.device, dtype=bool) + # Concatenate with loaded dict + if loaded_data_dict is not None: + loaded_batch_size = loaded_data_dict.batch_size + assert loaded_batch_size[0] == batch_size[0] + new_batch_size = ( + loaded_batch_size[0], + loaded_batch_size[1] + batch_size[1], + ) + data_dict = TensorDict( + { + key: torch.cat((loaded_data_dict[key], data_dict[key]), dim=1) + for key in data_dict.keys() + }, + device=self.device, + batch_size=new_batch_size, + ) + + if save_path: + torch.save(data_dict, save_path) + return data_dict def get_data_obs(self, obs_list, data_struct): @@ -160,8 +195,9 @@ def get_data_obs(self, obs_list, data_struct): return obs_all.float() - def get_data_rewards(self, data_struct, reward_weights, dt=0.01): - minimalist_cheetah = MinimalistCheetah(device=self.device) + def get_data_rewards(self, data_struct, reward_weights): + ctrl_dt = 1.0 / self.env.cfg.control.ctrl_frequency + minimalist_cheetah = MinimalistCheetah(ctrl_dt=ctrl_dt, device=self.device) rewards_dict = {name: [] for name in reward_weights.keys()} # for plotting rewards_all = torch.empty(0).to(self.device) @@ -188,7 +224,7 @@ def get_data_rewards(self, data_struct, reward_weights, dt=0.01): minimalist_cheetah.post_process() rewards_dict["total"] = rewards_all.tolist() - rewards_all *= dt # scaled for alg update + rewards_all *= ctrl_dt # scaled for alg update return rewards_all.float(), rewards_dict @@ -213,10 +249,9 @@ def update_state_estimates(self): ) idx += dim - def learn(self): - self.alg.switch_to_train() - - if self.env is not None: + def load_data(self, load_path=None, save_path=None): + # Load on- and off-policy data + if self.use_simulator: # Simulate on-policy data self.data_onpol = TensorDict( self.get_sim_data(), @@ -226,12 +261,21 @@ def learn(self): else: self.data_onpol = self.get_data_dict() - # Single alg update on data - if self.cfg["algorithm_class_name"] == "PPO_IPG": - self.data_offpol = self.get_data_dict(offpol=True) - self.alg.update(self.data_onpol, self.data_offpol) + if self.ipg: + self.data_offpol = self.get_data_dict( + offpol=True, load_path=load_path, save_path=save_path + ) else: + self.data_offpol = None + + def learn(self): + self.alg.switch_to_train() + + # Single alg update on data + if self.data_offpol is None: self.alg.update(self.data_onpol) + else: + self.alg.update(self.data_onpol, self.data_offpol) def get_sim_data(self): rewards_dict = {} @@ -328,16 +372,16 @@ def update_rewards(self, rewards_dict, terminated): ) def save(self, path): - if self.cfg["algorithm_class_name"] == "PPO_IPG": + if self.ipg: torch.save( { "actor_state_dict": self.alg.actor.state_dict(), "critic_v_state_dict": self.alg.critic_v.state_dict(), "critic_q_state_dict": self.alg.critic_q.state_dict(), + "target_critic_q_state_dict": self.alg.target_critic_q.state_dict(), "optimizer_state_dict": self.alg.optimizer.state_dict(), "critic_v_opt_state_dict": self.alg.critic_v_optimizer.state_dict(), "critic_q_opt_state_dict": self.alg.critic_q_optimizer.state_dict(), - "iter": 1, # only one iteration }, path, ) @@ -348,7 +392,6 @@ def save(self, path): "critic_state_dict": self.alg.critic.state_dict(), "optimizer_state_dict": self.alg.optimizer.state_dict(), "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(), - "iter": 1, # only one 
iteration }, path, ) @@ -357,9 +400,12 @@ def load(self, path, load_optimizer=True): loaded_dict = torch.load(path) self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) - if self.cfg["algorithm_class_name"] == "PPO_IPG": + if self.ipg: self.alg.critic_v.load_state_dict(loaded_dict["critic_v_state_dict"]) self.alg.critic_q.load_state_dict(loaded_dict["critic_q_state_dict"]) + self.alg.target_critic_q.load_state_dict( + loaded_dict["target_critic_q_state_dict"] + ) if load_optimizer: self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) self.alg.critic_v_optimizer.load_state_dict( diff --git a/learning/runners/ipg_runner.py b/learning/runners/ipg_runner.py index e46f5665..f5bcee26 100644 --- a/learning/runners/ipg_runner.py +++ b/learning/runners/ipg_runner.py @@ -27,8 +27,8 @@ def __init__(self, env, train_cfg, device="cpu"): def _set_up_alg(self): alg_class_name = self.cfg["algorithm_class_name"] - if alg_class_name != "PPO_IPG": - raise ValueError("IPGRunner only supports PPO_IPG") + if alg_class_name not in ["PPO_IPG", "LinkedIPG"]: + raise ValueError("IPGRunner only supports PPO_IPG or Linked_IPG") alg_class = eval(alg_class_name) num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) @@ -164,7 +164,7 @@ def learn(self): logger.toc("learning") if self.it % self.save_interval == 0: - self.save() + self.save(save_storage=False) storage_onpol.clear() # only clear on-policy storage logger.log_all_categories() @@ -173,7 +173,7 @@ def learn(self): logger.toc("runtime") logger.print_to_terminal() - self.save() + # self.save() @torch.no_grad def burn_in_normalization(self, n_iterations=100): @@ -240,6 +240,7 @@ def save(self, save_storage=False): "actor_state_dict": self.alg.actor.state_dict(), "critic_v_state_dict": self.alg.critic_v.state_dict(), "critic_q_state_dict": self.alg.critic_q.state_dict(), + "target_critic_q_state_dict": self.alg.target_critic_q.state_dict(), "optimizer_state_dict": self.alg.optimizer.state_dict(), "critic_v_opt_state_dict": self.alg.critic_v_optimizer.state_dict(), "critic_q_opt_state_dict": self.alg.critic_q_optimizer.state_dict(), @@ -263,6 +264,9 @@ def load(self, path, load_optimizer=True, load_actor_std=True): self.alg.actor.std = nn.Parameter(std_init) self.alg.critic_v.load_state_dict(loaded_dict["critic_v_state_dict"]) self.alg.critic_q.load_state_dict(loaded_dict["critic_q_state_dict"]) + self.alg.target_critic_q.load_state_dict( + loaded_dict["target_critic_q_state_dict"] + ) if load_optimizer: self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) self.alg.critic_v_optimizer.load_state_dict( diff --git a/scripts/eval_rewards.py b/scripts/eval_rewards.py index 87f60071..340c3cf9 100644 --- a/scripts/eval_rewards.py +++ b/scripts/eval_rewards.py @@ -13,15 +13,16 @@ import matplotlib.pyplot as plt ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" -SE_PATH = f"{LEGGED_GYM_ROOT_DIR}/logs/SE/model_1000.pt" # if None: no SE -LOAD_RUN = "Jul23_00-14-23_nu02_B8" +# SE_PATH = f"{LEGGED_GYM_ROOT_DIR}/logs/SE/model_1000.pt" # if None: no SE +SE_PATH = None +LOAD_RUN = "Jul25_16-06-36_LinkedIPG_50Hz_nu02_v08" REWARDS_FILE = None # generate this file from logs, if None: just plot + PLOT_REWARDS = { - "Nu=0.0": "rewards_nu0.csv", - "Nu=0.5": "rewards_nu05.csv", - "Nu=0.9": "rewards_nu09.csv", - "Nu=0.9 new": "rewards_nu09_new.csv", + "Nu=0.5 sim onpol concat": "rewards_nu05_sim_concat.csv", + "Nu=0.5 sim onpol": "rewards_nu05_sim_onpol.csv", + "Nu=0.5 real onpol": "rewards_nu05_real_onpol.csv", } # Data struct 
fields from Robot-Software logs @@ -37,7 +38,8 @@ "phase_obs", # 8 "grf", # 9 "dof_pos_target", # 10 - "exploration_noise", # 11 + "torques", # 11 + "exploration_noise", # 12 "footer", ] @@ -68,7 +70,10 @@ def update_rewards_df(runner): iteration = int(file.split(".")[0]) path = os.path.join(ROOT_DIR, LOAD_RUN, file) data = scipy.io.loadmat(path) - data_dict[iteration] = data[runner.data_name][0][0] + runner.data_struct = data[runner.data_name][0][0] + if runner.SE: + runner.update_state_estimates() + data_dict[iteration] = runner.data_struct rewards_path = os.path.join(ROOT_DIR, LOAD_RUN, REWARDS_FILE) if os.path.exists(rewards_path): @@ -149,6 +154,7 @@ def setup(): log_dir, data_list=DATA_LIST, device=DEVICE, + se_path=SE_PATH, ) return runner @@ -157,8 +163,6 @@ def setup(): if __name__ == "__main__": if REWARDS_FILE is not None: runner = setup() - if SE_PATH is not None: - runner.load_se(SE_PATH) update_rewards_df(runner) # Plot rewards stats diff --git a/scripts/finetune.py b/scripts/finetune.py index 6247a466..64dac794 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -9,13 +9,20 @@ import os import torch import numpy as np +import pandas as pd ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" -SE_PATH = f"{LEGGED_GYM_ROOT_DIR}/logs/SE/model_1000.pt" # if None: no SE +# SE_PATH = f"{LEGGED_GYM_ROOT_DIR}/logs/SE/model_1000.pt" # if None: no SE +SE_PATH = None OUTPUT_FILE = "output.txt" +LOSSES_FILE = "losses.csv" USE_SIMULATOR = True +# Load/save off-policy storage, this can contain many runs +OFFPOL_LOAD_FILE = "offpol_data_10.pt" +OFFPOL_SAVE_FILE = None + # Scales EXPLORATION_SCALE = 0.5 # used during data collection ACTION_SCALES = np.tile(np.array([0.2, 0.3, 0.3]), 4) @@ -56,6 +63,8 @@ def setup(): train_cfg, log_dir, data_list=DATA_LIST, + se_path=SE_PATH, + use_simulator=USE_SIMULATOR, exploration_scale=EXPLORATION_SCALE, device=DEVICE, ) @@ -64,20 +73,31 @@ def setup(): def finetune(runner): + # Load model load_run = runner.cfg["load_run"] checkpoint = runner.cfg["checkpoint"] - load_path = os.path.join(ROOT_DIR, load_run, "model_" + str(checkpoint) + ".pt") - runner.load(load_path) + model_path = os.path.join(ROOT_DIR, load_run, "model_" + str(checkpoint) + ".pt") + runner.load(model_path) - if SE_PATH is not None: - runner.load_se(SE_PATH) + # Load data + load_path = ( + os.path.join(ROOT_DIR, load_run, OFFPOL_LOAD_FILE) if OFFPOL_LOAD_FILE else None + ) + save_path = ( + os.path.join(ROOT_DIR, load_run, OFFPOL_SAVE_FILE) if OFFPOL_SAVE_FILE else None + ) + runner.load_data(load_path=load_path, save_path=save_path) + + # Get old inference actions + action_scales = torch.tensor(ACTION_SCALES).to(DEVICE) + actions_old = action_scales * runner.alg.actor.act_inference( + runner.data_onpol["actor_obs"] + ) # Perform a single update runner.learn() - # Compare old and new actions - actions_old = runner.data_onpol["actions"] - action_scales = torch.tensor(ACTION_SCALES).to(DEVICE) + # Compare old to new actions actions_new = action_scales * runner.alg.actor.act_inference( runner.data_onpol["actor_obs"] ) @@ -92,6 +112,7 @@ def finetune(runner): # Print to output file with open(os.path.join(ROOT_DIR, load_run, OUTPUT_FILE), "a") as f: f.write(f"############ Checkpoint: {checkpoint} #######################\n") + f.write(f"############## Nu={runner.alg.inter_nu} ###################\n") f.write("############### DATA ###############\n") f.write(f"Data on-policy shape: {runner.data_onpol.shape}\n") if runner.data_offpol is not None: @@ -107,6 +128,38 @@ def 
finetune(runner): f.write(f"Std action diff per actuator: {diff.std(dim=(0, 1))}\n") f.write(f"Overall mean action diff: {diff.mean()}\n") + # Log losses to csv + losses_path = os.path.join(ROOT_DIR, load_run, LOSSES_FILE) + if not os.path.exists(losses_path): + if runner.data_offpol is None: + losses_df = pd.DataFrame( + columns=["checkpoint", "value_loss", "surrogate_loss"] + ) + else: + losses_df = pd.DataFrame( + columns=[ + "checkpoint", + "value_loss", + "q_loss", + "surrogate_loss", + "offpol_loss", + ] + ) + else: + losses_df = pd.read_csv(losses_path) + + append_data = { + "checkpoint": checkpoint, + "value_loss": runner.alg.mean_value_loss, + "surrogate_loss": runner.alg.mean_surrogate_loss, + } + if runner.data_offpol is not None: + append_data["q_loss"] = runner.alg.mean_q_loss + append_data["offpol_loss"] = runner.alg.mean_offpol_loss + + losses_df = losses_df._append(append_data, ignore_index=True) + losses_df.to_csv(losses_path, index=False) + if __name__ == "__main__": runner = setup() diff --git a/scripts/finetune_multiple.sh b/scripts/finetune_multiple.sh index 95e84728..b232a36a 100644 --- a/scripts/finetune_multiple.sh +++ b/scripts/finetune_multiple.sh @@ -2,10 +2,11 @@ source /home/lmolnar/miniconda3/etc/profile.d/conda.sh # Args -LOAD_RUN="Jul23_00-14-23_nu02_B8" +LOAD_RUN="Jul25_16-06-36_LinkedIPG_50Hz_nu02_v08" CHECKPOINT=1000 -N_RUNS=6 -EVAL=false # make sure to turn exploration on/off in RS +N_RUNS=10 +INTER_NU=0.9 # can be fixed or adaptive +EVAL=false # no finetuning, just evaluate without exploration (set in RS) # Set directories QGYM_DIR="/home/lmolnar/workspace/QGym" @@ -19,11 +20,11 @@ cp ${QGYM_LOG_DIR}/exported_${CHECKPOINT}/* ${RS_DIR}/config/systems/quadruped/c for i in $(seq 1 $N_RUNS) do - # Store LCM logs in file labeled with checkpoint - FILE=${RS_DIR}/logging/lcm_logs/${CHECKPOINT} - if [ -f "$FILE" ]; then - rm "$FILE" - fi + # Store logs in files labeled by checkpoint + LCM_FILE=${RS_DIR}/logging/lcm_logs/${CHECKPOINT} + MAT_FILE=${RS_DIR}/logging/matlab_logs/${CHECKPOINT}.mat + rm ${LCM_FILE} + rm ${MAT_FILE} # Run logging script in background ${RS_DIR}/logging/scripts/run_lcm_logger.sh ${CHECKPOINT} & @@ -40,10 +41,11 @@ do cp ${RS_DIR}/logging/matlab_logs/${CHECKPOINT}.mat ${QGYM_LOG_DIR} # Finetune in QGym + # INTER_NU=$(echo "0.05 * $i" | bc) # adaptive if [ "$EVAL" = false ] ; then conda deactivate conda activate qgym - python ${QGYM_DIR}/scripts/finetune.py --task=mini_cheetah_finetune --headless --load_run=${LOAD_RUN} --checkpoint=${CHECKPOINT} + python ${QGYM_DIR}/scripts/finetune.py --task=mini_cheetah_finetune --headless --load_run=${LOAD_RUN} --checkpoint=${CHECKPOINT} --inter_nu=${INTER_NU} fi # Copy policy to RS diff --git a/scripts/visualize_ipg.py b/scripts/visualize_ipg.py index 993f4e0d..95b03cad 100644 --- a/scripts/visualize_ipg.py +++ b/scripts/visualize_ipg.py @@ -17,8 +17,8 @@ DEVICE = "cpu" # * Setup -LOAD_RUN = "Jul17_17-22-37_IPG_nu1" -TITLE = "IPG nu=1.0" +LOAD_RUN = "Jul25_11-28-41_LinkedIPG_nu05_v05" +TITLE = "LinkedIPG nu=0.5 blend=0.5 polyak=0.9" IT_RANGE = range(20, 101, 20) RUN_DIR = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", "pendulum", LOAD_RUN) @@ -53,7 +53,7 @@ for it in IT_RANGE: # load data - base_data = torch.load(os.path.join(RUN_DIR, "data_onpol{}.pt".format(it))).to( + base_data = torch.load(os.path.join(RUN_DIR, "data_onpol_{}.pt".format(it))).to( DEVICE ) data = base_data.detach().clone() From 261a4d56880ecfee39f23921871d9643ef4e9a4d Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Sat, 27 Jul 2024 
09:19:27 -0400 Subject: [PATCH 42/42] cleanup --- .../mini_cheetah_finetune_config.py | 13 ++--- .../mini_cheetah/mini_cheetah_ref_config.py | 16 +++++-- gym/envs/mini_cheetah/minimalist_cheetah.py | 11 +++-- learning/algorithms/linked_ipg.py | 15 +++--- learning/algorithms/ppo_ipg.py | 10 +++- learning/runners/finetune_runner.py | 18 +++++-- learning/runners/ipg_runner.py | 9 ++-- scripts/eval_rewards.py | 15 +++--- scripts/finetune.py | 20 ++++---- scripts/finetune_multiple.sh | 5 +- scripts/generate_commands.py | 47 +++++++++++++------ scripts/plot_losses.py | 30 ++++++++++++ 12 files changed, 146 insertions(+), 63 deletions(-) create mode 100644 scripts/plot_losses.py diff --git a/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py b/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py index f4fd30a8..31f41894 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py @@ -104,7 +104,7 @@ class weights: tracking_ang_vel = 2.0 orientation = 1.0 min_base_height = 1.5 - stand_still = 2.0 + stand_still = 1.0 swing_grf = 3.0 stance_grf = 3.0 action_rate = 0.01 @@ -138,13 +138,14 @@ class algorithm(MiniCheetahRefRunnerCfg.algorithm): use_cv = False inter_nu = 0.9 beta = "off_policy" - storage_size = 30000 + storage_size = 30_000 + # val_interpolation = 0.8 # Finetuning clip_param = 0.2 - max_gradient_steps = 4 - batch_size = 30000 - learning_rate = 1e-4 + max_gradient_steps = 8 + batch_size = 30_000 + learning_rate = 5e-5 # ACTOR schedule = "fixed" class runner(MiniCheetahRefRunnerCfg.runner): @@ -156,6 +157,6 @@ class runner(MiniCheetahRefRunnerCfg.runner): # Finetuning resume = True - load_run = "Jul23_00-14-23_nu02_B8" + load_run = "Jul24_22-48-41_nu05_B8" checkpoint = 1000 save_interval = 1 diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 34c20416..2e808719 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -146,15 +146,23 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): desired_kl = 0.02 # 0.02 for smooth-exploration, else 0.01 # IPG - polyak = 0.995 + polyak = 0.95 use_cv = False inter_nu = 0.2 beta = "off_policy" - storage_size = 8 * 32 * 4096 # num_policies*num_stpes*num_envs + storage_size = 8 * 32 * 4096 # num_policies*num_steps*num_envs + val_interpolation = 0.8 # 0: use V(s'), 1: use Q(s', pi(s')) + learning_rate = 1.0e-3 + lr_range = [1e-5, 1e-2] + lr_ratio = 1.5 class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" - max_iterations = 1000 # number of policy updates - algorithm_class_name = "PPO_IPG" + max_iterations = 800 # number of policy updates + algorithm_class_name = "LinkedIPG" num_steps_per_env = 32 + + # resume = False + # load_run = "Jul25_12-24-16_LinkedIPG_50Hz_nu02_v08" + # checkpoint = 1000 diff --git a/gym/envs/mini_cheetah/minimalist_cheetah.py b/gym/envs/mini_cheetah/minimalist_cheetah.py index 4702f6ac..8052697d 100644 --- a/gym/envs/mini_cheetah/minimalist_cheetah.py +++ b/gym/envs/mini_cheetah/minimalist_cheetah.py @@ -60,14 +60,15 @@ def set_states( ) self.grf = torch.tensor(grf, device=self.device).unsqueeze(0) - # Compute phase sin + # Get phase sin phase_obs = torch.tensor(phase_obs, device=self.device).unsqueeze(0) self.phase_sin = phase_obs[:, 0].unsqueeze(0) - # Set targets, these are scaled in Robot-Software - self.dof_pos_target = torch.tensor( - dof_pos_target, device=self.device - ).unsqueeze(0) + # 
Set target history + self.dof_pos_target = ( + torch.tensor(dof_pos_target, device=self.device).unsqueeze(0) + * self.dof_pos_scales + ) if self.dof_target_prev is None: self.dof_target_prev = self.dof_pos_target if self.dof_target_prev2 is None: diff --git a/learning/algorithms/linked_ipg.py b/learning/algorithms/linked_ipg.py index de024276..dbaef5eb 100644 --- a/learning/algorithms/linked_ipg.py +++ b/learning/algorithms/linked_ipg.py @@ -96,13 +96,14 @@ def update(self, data_onpol, data_offpol): data_onpol["returns"] = data_onpol["advantages"] + data_onpol["values"] data_onpol["advantages"] = normalize(data_onpol["advantages"]) - data_offpol["values"] = self.critic_v.evaluate(data_offpol["critic_obs"]) - data_offpol["advantages"] = compute_generalized_advantages( - data_offpol, self.gamma, self.lam, self.critic_v - ) - data_offpol["returns"] = data_offpol["advantages"] + data_offpol["values"] - - self.update_critic_v(data_offpol) + # TODO: Possibly use off-policy GAE for V-critic update + # data_offpol["values"] = self.critic_v.evaluate(data_offpol["critic_obs"]) + # data_offpol["advantages"] = compute_generalized_advantages( + # data_offpol, self.gamma, self.lam, self.critic_v + # ) + # data_offpol["returns"] = data_offpol["advantages"] + data_offpol["values"] + + self.update_critic_v(data_onpol) self.update_critic_q(data_offpol) self.update_actor(data_onpol, data_offpol) diff --git a/learning/algorithms/ppo_ipg.py b/learning/algorithms/ppo_ipg.py index 0738b86d..c3f1c5be 100644 --- a/learning/algorithms/ppo_ipg.py +++ b/learning/algorithms/ppo_ipg.py @@ -94,8 +94,15 @@ def update(self, data_onpol, data_offpol): data_onpol["returns"] = data_onpol["advantages"] + data_onpol["values"] data_onpol["advantages"] = normalize(data_onpol["advantages"]) - self.update_critic_q(data_offpol) + # TODO: Possibly use off-policy GAE for V-critic update + # data_offpol["values"] = self.critic_v.evaluate(data_offpol["critic_obs"]) + # data_offpol["advantages"] = compute_generalized_advantages( + # data_offpol, self.gamma, self.lam, self.critic_v + # ) + # data_offpol["returns"] = data_offpol["advantages"] + data_offpol["values"] + self.update_critic_v(data_onpol) + self.update_critic_q(data_offpol) self.update_actor(data_onpol, data_offpol) def update_critic_q(self, data): @@ -120,7 +127,6 @@ def update_critic_q(self, data): ) q_input = torch.cat((batch["critic_obs"], batch["actions"]), dim=-1) q_loss = self.critic_q.loss_fn(q_input, q_target) - print(q_loss.item()) self.critic_q_optimizer.zero_grad() q_loss.backward() nn.utils.clip_grad_norm_(self.critic_q.parameters(), self.max_grad_norm) diff --git a/learning/runners/finetune_runner.py b/learning/runners/finetune_runner.py index d8b37756..c4a33f64 100644 --- a/learning/runners/finetune_runner.py +++ b/learning/runners/finetune_runner.py @@ -20,7 +20,7 @@ def __init__( train_cfg, log_dir, data_list, - data_length=1500, + data_length=3000, data_name="SMOOTH_RL_CONTROLLER", se_path=None, use_simulator=True, @@ -271,6 +271,10 @@ def load_data(self, load_path=None, save_path=None): def learn(self): self.alg.switch_to_train() + # Set fixed actor LR from config + for param_group in self.alg.optimizer.param_groups: + param_group["lr"] = self.alg_cfg["learning_rate"] + # Single alg update on data if self.data_offpol is None: self.alg.update(self.data_onpol) @@ -403,9 +407,15 @@ def load(self, path, load_optimizer=True): if self.ipg: self.alg.critic_v.load_state_dict(loaded_dict["critic_v_state_dict"]) 
self.alg.critic_q.load_state_dict(loaded_dict["critic_q_state_dict"]) - self.alg.target_critic_q.load_state_dict( - loaded_dict["target_critic_q_state_dict"] - ) + # TODO: Possibly always load target with critic_q_state_dict + try: + self.alg.target_critic_q.load_state_dict( + loaded_dict["target_critic_q_state_dict"] + ) + except: + self.alg.target_critic_q.load_state_dict( + loaded_dict["critic_q_state_dict"] + ) if load_optimizer: self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) self.alg.critic_v_optimizer.load_state_dict( diff --git a/learning/runners/ipg_runner.py b/learning/runners/ipg_runner.py index f5bcee26..252f6098 100644 --- a/learning/runners/ipg_runner.py +++ b/learning/runners/ipg_runner.py @@ -264,9 +264,12 @@ def load(self, path, load_optimizer=True, load_actor_std=True): self.alg.actor.std = nn.Parameter(std_init) self.alg.critic_v.load_state_dict(loaded_dict["critic_v_state_dict"]) self.alg.critic_q.load_state_dict(loaded_dict["critic_q_state_dict"]) - self.alg.target_critic_q.load_state_dict( - loaded_dict["target_critic_q_state_dict"] - ) + try: + self.alg.target_critic_q.load_state_dict( + loaded_dict["target_critic_q_state_dict"] + ) + except: + self.alg.target_critic_q.load_state_dict(loaded_dict["critic_q_state_dict"]) if load_optimizer: self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) self.alg.critic_v_optimizer.load_state_dict( diff --git a/scripts/eval_rewards.py b/scripts/eval_rewards.py index 340c3cf9..1c527e5f 100644 --- a/scripts/eval_rewards.py +++ b/scripts/eval_rewards.py @@ -15,14 +15,17 @@ ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" # SE_PATH = f"{LEGGED_GYM_ROOT_DIR}/logs/SE/model_1000.pt" # if None: no SE SE_PATH = None -LOAD_RUN = "Jul25_16-06-36_LinkedIPG_50Hz_nu02_v08" +LOAD_RUN = "Jul24_22-48-41_nu05_B8" -REWARDS_FILE = None # generate this file from logs, if None: just plot +REWARDS_FILE = ( + "rewards_nu09_nosim2.csv" # generate this file from logs, if None: just plot +) PLOT_REWARDS = { - "Nu=0.5 sim onpol concat": "rewards_nu05_sim_concat.csv", - "Nu=0.5 sim onpol": "rewards_nu05_sim_onpol.csv", - "Nu=0.5 real onpol": "rewards_nu05_real_onpol.csv", + "Nu=0.5 no sim": "rewards_nu05_nosim.csv", + "Nu=0.9 no sim": "rewards_nu09_nosim.csv", + "Nu=0.9 no sim 2": "rewards_nu09_nosim2.csv", + "Nu=0.95 no sim": "rewards_nu095_nosim.csv", } # Data struct fields from Robot-Software logs @@ -48,7 +51,7 @@ "tracking_ang_vel": 2.0, "min_base_height": 1.5, "orientation": 1.0, - "stand_still": 2.0, + "stand_still": 1.0, "swing_grf": 3.0, "stance_grf": 3.0, "action_rate": 0.01, diff --git a/scripts/finetune.py b/scripts/finetune.py index 64dac794..4db64925 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -17,14 +17,15 @@ OUTPUT_FILE = "output.txt" LOSSES_FILE = "losses.csv" -USE_SIMULATOR = True +USE_SIMULATOR = False # For on-policy data +DATA_LENGTH = 12_000 # Robot-Software logs must contain at least this many steps # Load/save off-policy storage, this can contain many runs -OFFPOL_LOAD_FILE = "offpol_data_10.pt" -OFFPOL_SAVE_FILE = None +OFFPOL_LOAD_FILE = None # "offpol_data.pt" +OFFPOL_SAVE_FILE = None # "offpol_data.pt" # Scales -EXPLORATION_SCALE = 0.5 # used during data collection +EXPLORATION_SCALE = 0.8 # used during data collection ACTION_SCALES = np.tile(np.array([0.2, 0.3, 0.3]), 4) # Data struct fields from Robot-Software logs @@ -63,6 +64,7 @@ def setup(): train_cfg, log_dir, data_list=DATA_LIST, + data_length=DATA_LENGTH, se_path=SE_PATH, use_simulator=USE_SIMULATOR, 
exploration_scale=EXPLORATION_SCALE, @@ -131,11 +133,7 @@ def finetune(runner): # Log losses to csv losses_path = os.path.join(ROOT_DIR, load_run, LOSSES_FILE) if not os.path.exists(losses_path): - if runner.data_offpol is None: - losses_df = pd.DataFrame( - columns=["checkpoint", "value_loss", "surrogate_loss"] - ) - else: + if runner.ipg: losses_df = pd.DataFrame( columns=[ "checkpoint", @@ -145,6 +143,10 @@ def finetune(runner): "offpol_loss", ] ) + else: + losses_df = pd.DataFrame( + columns=["checkpoint", "value_loss", "surrogate_loss"] + ) else: losses_df = pd.read_csv(losses_path) diff --git a/scripts/finetune_multiple.sh b/scripts/finetune_multiple.sh index b232a36a..e3dd96c1 100644 --- a/scripts/finetune_multiple.sh +++ b/scripts/finetune_multiple.sh @@ -2,9 +2,9 @@ source /home/lmolnar/miniconda3/etc/profile.d/conda.sh # Args -LOAD_RUN="Jul25_16-06-36_LinkedIPG_50Hz_nu02_v08" +LOAD_RUN="Jul24_22-48-41_nu05_B8" CHECKPOINT=1000 -N_RUNS=10 +N_RUNS=5 INTER_NU=0.9 # can be fixed or adaptive EVAL=false # no finetuning, just evaluate without exploration (set in RS) @@ -24,7 +24,6 @@ do LCM_FILE=${RS_DIR}/logging/lcm_logs/${CHECKPOINT} MAT_FILE=${RS_DIR}/logging/matlab_logs/${CHECKPOINT}.mat rm ${LCM_FILE} - rm ${MAT_FILE} # Run logging script in background ${RS_DIR}/logging/scripts/run_lcm_logger.sh ${CHECKPOINT} & diff --git a/scripts/generate_commands.py b/scripts/generate_commands.py index e8ef18a4..9fe21d79 100644 --- a/scripts/generate_commands.py +++ b/scripts/generate_commands.py @@ -4,23 +4,42 @@ # Command ranges during training: # x_range = [-2.0, 3.0] # [m/s] # y_range = [-1.0, 1.0] # [m/s] -# yaw_range = [-3.0. 3.0] # [rad/s] +# yaw_range = [-3.0, 3.0] # [rad/s] # Command ranges for finetuning: -x_range = [-1.0, 1.5] # [m/s] -y_range = [-0.5, 0.5] # [m/s] -yaw_range = [-1.5, 1.5] # [rad/s] +x_range = [-0.67, 1.0] # [m/s] +y_range = [-0.33, 0.33] # [m/s] +yaw_range = [-2.0, 2.0] # [rad/s] -# Generate random command sequence -N = 10 -steps = 300 -commands = np.zeros((N * steps, 3)) +# Generate structured command sequence (fixed lin/ang vel, yaw, some random) +N = 100 +cmds_zero = np.array([[0, 0, 0]]).repeat(N, axis=0) +cmds = np.zeros((N, 3)) +for _ in range(4): + cmds = np.append(cmds, np.array([[x_range[1], 0, 0]]).repeat(3 * N, axis=0), axis=0) + cmds = np.append(cmds, cmds_zero, axis=0) + cmds = np.append(cmds, np.array([[x_range[0], 0, 0]]).repeat(3 * N, axis=0), axis=0) + cmds = np.append(cmds, cmds_zero, axis=0) + cmds = np.append(cmds, np.array([[0, y_range[1], 0]]).repeat(2 * N, axis=0), axis=0) + cmds = np.append(cmds, cmds_zero, axis=0) + cmds = np.append(cmds, np.array([[0, y_range[0], 0]]).repeat(2 * N, axis=0), axis=0) + cmds = np.append(cmds, cmds_zero, axis=0) + cmds = np.append( + cmds, np.array([[0, 0, yaw_range[1]]]).repeat(2 * N, axis=0), axis=0 + ) + cmds = np.append(cmds, cmds_zero, axis=0) + cmds = np.append( + cmds, np.array([[0, 0, yaw_range[0]]]).repeat(2 * N, axis=0), axis=0 + ) + cmds = np.append(cmds, cmds_zero, axis=0) -for i in range(1, N): # first iteration 0 commands - x = random.uniform(x_range[0], x_range[1]) - y = random.uniform(y_range[0], y_range[1]) - yaw = random.uniform(yaw_range[0], yaw_range[1]) - commands[i * steps : (i + 1) * steps] = np.array([x, y, yaw]) + for i in range(5): + x = random.uniform(x_range[0], x_range[1]) + y = random.uniform(y_range[0], y_range[1]) + yaw = random.uniform(yaw_range[0], yaw_range[1]) + cmds = np.append(cmds, np.array([[x, y, yaw]]).repeat(2 * N, axis=0), axis=0) + +print(cmds.shape) # Export to txt 
-np.savetxt("commands.txt", commands, fmt="%.3f") +np.savetxt("commands_long.txt", cmds, fmt="%.3f") diff --git a/scripts/plot_losses.py b/scripts/plot_losses.py new file mode 100644 index 00000000..e165b860 --- /dev/null +++ b/scripts/plot_losses.py @@ -0,0 +1,30 @@ +from gym import LEGGED_GYM_ROOT_DIR + +import os +import matplotlib.pyplot as plt +import pandas as pd + +ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" +LOAD_RUN = "Jul26_11-58-37_LinkedIPG_100Hz_nu02_v08" + +PLOT_LOSSES = { + "Nu=0.9 sim": "losses_nu09_sim.csv", + "Nu=0.9 no sim": "losses_nu09_nosim.csv", +} + +LABELS = ["value_loss", "surrogate_loss", "q_loss", "offpol_loss"] + +fig, axs = plt.subplots(2, 2, figsize=(10, 6)) +plt.suptitle(" IPG Finetuning Losses") + +for i, label in enumerate(LABELS): + for name, file in PLOT_LOSSES.items(): + data = pd.read_csv(os.path.join(ROOT_DIR, LOAD_RUN, file)) + axs[i // 2, i % 2].plot(data[label], label=name) + axs[i // 2, i % 2].set_title(label) + axs[i // 2, i % 2].set_xlabel("Checkpoint") + axs[i // 2, i % 2].set_ylabel("Loss") + axs[i // 2, i % 2].legend() + +plt.tight_layout() +plt.show()