Draft
45 commits
bb85463
smooth noise sampling and started gSDE
lukasmolnar Feb 28, 2024
243cf0f
adress comments
lukasmolnar Mar 1, 2024
d838dc6
start moving things to SmoothActor
lukasmolnar Mar 1, 2024
aea5d88
moved everything to SmoothActor (it runs)
lukasmolnar Mar 1, 2024
3603205
possibly resample in PPO update
lukasmolnar Mar 1, 2024
bb0a273
learn_features=True and correct sample dim
lukasmolnar Mar 4, 2024
40025c4
adjust sample freq and update plotting
lukasmolnar Mar 4, 2024
41e9dea
Merge branch 'dev' into lm/smooth-exploration
lukasmolnar Mar 11, 2024
c301746
update log_std_init=0.0 and refactor
lukasmolnar Mar 15, 2024
a372a52
log joint data for training and play
lukasmolnar Apr 3, 2024
599b2ca
update logging and plotting
lukasmolnar Apr 6, 2024
4994550
plot FT script
lukasmolnar Apr 19, 2024
0205d99
added sweep config
lukasmolnar Apr 19, 2024
a68d7ce
Merge remote-tracking branch 'origin/dev' into lm/smooth-exploration
lukasmolnar Apr 19, 2024
64b9b14
update on policy and old policy runners, get nans for log_probs
lukasmolnar Apr 24, 2024
91656e9
run 200 iterations before starting training, to burn in normalization
sheim Apr 26, 2024
c0a6d61
Merge pull request #15 from mit-biomimetics/sh/smooth-exploration
lukasmolnar Apr 26, 2024
f74e7fe
update dummy input in export_network
lukasmolnar May 3, 2024
18045e9
good choice of params: sample 16, rollout 32, LR x1.1, des_kl 0.02
lukasmolnar May 3, 2024
adfb078
export latent network and update configs
lukasmolnar May 6, 2024
8ff82ae
export latent net with norm and log get_std
lukasmolnar May 6, 2024
007ea93
export actor std to txt file
lukasmolnar May 6, 2024
1ae4ac0
cleanup branch, remove logging and plotting
lukasmolnar Jul 5, 2024
70da359
PPO, PPO-smooth and IPG all learn (IPG-smooth kind of)
lukasmolnar Jul 8, 2024
d010633
IPG-smooth learns decently (adapt LR)
lukasmolnar Jul 8, 2024
35a65c2
refactor for changes on dev
lukasmolnar Jul 10, 2024
5ce0d6e
finetune using robot-software logs
lukasmolnar Jul 11, 2024
cb75936
handle single env dimension at critic
lukasmolnar Jul 11, 2024
f5829f0
update finetuning: exploration scale, grf rewards, plotting
lukasmolnar Jul 11, 2024
f3a4cc4
added min base height reward
lukasmolnar Jul 11, 2024
fdc028f
reset 10% standstill/pure lin/pure ang vel; set rewards: grf=3, stand…
lukasmolnar Jul 13, 2024
3006c4c
evaluate rewards and store in dataframe, added stand still/action rate
lukasmolnar Jul 15, 2024
27e7568
handle IPG in FintuneRunner, and revert storage to single traj
lukasmolnar Jul 15, 2024
0a40204
collect sim data for finetuning, added finetune config
lukasmolnar Jul 16, 2024
b01b0a3
script to finetune multiple runs with robot-software
lukasmolnar Jul 16, 2024
77f5bd4
bugfix finetune script, update rewards evaluation
lukasmolnar Jul 16, 2024
68fde3e
build off-policy buffer for IPG from RS data
lukasmolnar Jul 17, 2024
5e56d54
remove rewards logging in finetune.py
lukasmolnar Jul 17, 2024
1c70ce5
merge lm/SAC_plotting branch, visualize PPO and IPG critics
lukasmolnar Jul 17, 2024
43dec75
finetune IPG with sim on-policy and hardware off-policy, rewards incr…
lukasmolnar Jul 19, 2024
b004dbb
update reward plots
lukasmolnar Jul 19, 2024
36c1c8d
integrate SE for finetuning, refactor runner
lukasmolnar Jul 23, 2024
edf3167
bugfix scale action noise, added scaling and refactorings
lukasmolnar Jul 23, 2024
276f84b
merged LinkedIPG, bugfix target network save, refactorings
lukasmolnar Jul 26, 2024
261a4d5
cleanup
lukasmolnar Jul 27, 2024
7 changes: 7 additions & 0 deletions gym/envs/__init__.py
@@ -27,6 +27,7 @@
"MiniCheetahCfg": ".mini_cheetah.mini_cheetah_config",
"MiniCheetahRefCfg": ".mini_cheetah.mini_cheetah_ref_config",
"MiniCheetahOscCfg": ".mini_cheetah.mini_cheetah_osc_config",
"MiniCheetahFineTuneCfg": ".mini_cheetah.mini_cheetah_finetune_config",
"MITHumanoidCfg": ".mit_humanoid.mit_humanoid_config",
"A1Cfg": ".a1.a1_config",
"AnymalCFlatCfg": ".anymal_c.flat.anymal_c_flat_config",
@@ -39,6 +40,7 @@
"MiniCheetahRunnerCfg": ".mini_cheetah.mini_cheetah_config",
"MiniCheetahRefRunnerCfg": ".mini_cheetah.mini_cheetah_ref_config",
"MiniCheetahOscRunnerCfg": ".mini_cheetah.mini_cheetah_osc_config",
"MiniCheetahFineTuneRunnerCfg": ".mini_cheetah.mini_cheetah_finetune_config",
"MITHumanoidRunnerCfg": ".mit_humanoid.mit_humanoid_config",
"A1RunnerCfg": ".a1.a1_config",
"AnymalCFlatRunnerCfg": ".anymal_c.flat.anymal_c_flat_config",
@@ -59,6 +61,11 @@
"MiniCheetahOscCfg",
"MiniCheetahOscRunnerCfg",
],
"mini_cheetah_finetune": [
"MiniCheetahRef",
"MiniCheetahFineTuneCfg",
"MiniCheetahFineTuneRunnerCfg",
],
"humanoid": ["MIT_Humanoid", "MITHumanoidCfg", "MITHumanoidRunnerCfg"],
"humanoid_running": [
"HumanoidRunning",
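
The new finetune config classes are wired into the same name-to-module tables as the existing mini-cheetah variants, and a "mini_cheetah_finetune" task entry reuses the MiniCheetahRef environment with the new config pair. A minimal sketch of how such a table can be resolved lazily at runtime is shown below; the CONFIG_MODULES dict and load_config helper are illustrative only, not the repo's actual loader.

# ----- illustrative sketch (not part of this PR) -----
import importlib

# Name -> module path, mirroring the table edited above (relative to gym.envs).
CONFIG_MODULES = {
    "MiniCheetahFineTuneCfg": ".mini_cheetah.mini_cheetah_finetune_config",
    "MiniCheetahFineTuneRunnerCfg": ".mini_cheetah.mini_cheetah_finetune_config",
}

def load_config(name, package="gym.envs"):
    """Import the module that defines `name` and return the class."""
    module = importlib.import_module(CONFIG_MODULES[name], package=package)
    return getattr(module, name)

# e.g. cfg_cls = load_config("MiniCheetahFineTuneCfg")
# -----------------------------------------------------

Deferring the import until a config is requested keeps unused robot modules from loading at startup, which is presumably why the registry stores module paths rather than classes.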
97 changes: 39 additions & 58 deletions gym/envs/base/fixed_robot.py
@@ -40,64 +40,60 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless):
self.reset()

def step(self):
"""Apply actions, simulate, call self.post_physics_step()
and pre_physics_step()

Args:
actions (torch.Tensor): Tensor of shape
(num_envs, num_actions_per_env)
"""

self._reset_buffers()
self._pre_physics_step()
# * step physics and render each frame
self._pre_decimation_step()
self._render()
for _ in range(self.cfg.control.decimation):
self._pre_compute_torques()
self.torques = self._compute_torques()

if self.cfg.asset.disable_motors:
self.torques[:] = 0.0
torques_to_gym_tensor = torch.zeros(
self.num_envs, self.num_dof, device=self.device
)

# todo encapsulate
next_torques_idx = 0
for dof_idx in range(self.num_dof):
if self.cfg.control.actuated_joints_mask[dof_idx]:
torques_to_gym_tensor[:, dof_idx] = self.torques[
:, next_torques_idx
]
next_torques_idx += 1
else:
torques_to_gym_tensor[:, dof_idx] = torch.zeros(
self.num_envs, device=self.device
)

self.gym.set_dof_actuation_force_tensor(
self.sim, gymtorch.unwrap_tensor(torques_to_gym_tensor)
)
self.gym.simulate(self.sim)
if self.device == "cpu":
self.gym.fetch_results(self.sim, True)
self.gym.refresh_dof_state_tensor(self.sim)

self._post_physics_step()
self._post_compute_torques()
self._step_physx_sim()
self._post_physx_step()
self._post_decimation_step()
self._check_terminations_and_timeouts()

env_ids = self.to_be_reset.nonzero(as_tuple=False).flatten()
self._reset_idx(env_ids)

def _pre_physics_step(self):
pass
def _pre_decimation_step(self):
return None

def _pre_compute_torques(self):
return None

def _post_compute_torques(self):
if self.cfg.asset.disable_motors:
self.torques[:] = 0.0

def _post_physics_step(self):
def _step_physx_sim(self):
next_torques_idx = 0
torques_to_gym_tensor = torch.zeros(
self.num_envs, self.num_dof, device=self.device
)
for dof_idx in range(self.num_dof):
if self.cfg.control.actuated_joints_mask[dof_idx]:
torques_to_gym_tensor[:, dof_idx] = self.torques[:, next_torques_idx]
next_torques_idx += 1
else:
torques_to_gym_tensor[:, dof_idx] = torch.zeros(
self.num_envs, device=self.device
)
self.gym.set_dof_actuation_force_tensor(
self.sim, gymtorch.unwrap_tensor(self.torques)
)
self.gym.simulate(self.sim)
if self.device == "cpu":
self.gym.fetch_results(self.sim, True)
self.gym.refresh_dof_state_tensor(self.sim)

def _post_physx_step(self):
"""
check terminations, compute observations and rewards
"""
self.gym.refresh_actor_root_state_tensor(self.sim)
self.gym.refresh_net_contact_force_tensor(self.sim)

def _post_decimation_step(self):
self.episode_length_buf += 1
self.common_step_counter += 1

@@ -212,18 +208,6 @@ def _process_rigid_body_props(self, props, env_id):
return props

def _compute_torques(self):
"""Compute torques from actions.
Actions can be interpreted as position or velocity targets given
to a PD controller, or directly as scaled torques.
[NOTE]: torques must have the same dimension as the number of DOFs,
even if some DOFs are not actuated.

Args:
actions (torch.Tensor): Actions

Returns:
[torch.Tensor]: Torques sent to the simulation
"""
actuated_dof_pos = torch.zeros(
self.num_envs, self.num_actuators, device=self.device
)
@@ -415,10 +399,7 @@ def _init_buffers(self):
self.default_act_pos = self.default_act_pos.unsqueeze(0)
# * store indices of actuated joints
self.act_idx = to_torch(actuated_idx, dtype=torch.long, device=self.device)
# * check that init range highs and lows are consistent
# * and repopulate to match
if self.cfg.init_state.reset_mode == "reset_to_range":
self.initialize_ranges_for_initial_conditions()
self.initialize_ranges_for_initial_conditions()

def initialize_ranges_for_initial_conditions(self):
self.dof_pos_range = torch.zeros(
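
The refactor above splits the monolithic step() into overridable phase hooks: _pre_decimation_step, _pre_compute_torques, _post_compute_torques, _step_physx_sim, _post_physx_step and _post_decimation_step. A subclass can then customize a single phase without re-implementing the whole loop. The sketch below is hypothetical: the FixedRobot base-class name is inferred from the file name, and the low-pass filter and _prev_torques buffer are not part of this diff.

# ----- illustrative sketch (not part of this PR) -----
import torch

from gym.envs.base.fixed_robot import FixedRobot  # class name assumed


class FilteredTorqueRobot(FixedRobot):
    """Hypothetical subclass: low-pass filter torques between compute and apply."""

    def _init_buffers(self):
        super()._init_buffers()
        # Assumed extra buffer holding the previous torque command.
        self._prev_torques = torch.zeros(
            self.num_envs, self.num_actuators, device=self.device
        )

    def _post_compute_torques(self):
        # Keep the parent behavior (zero torques when motors are disabled) ...
        super()._post_compute_torques()
        # ... then blend with the previous command before _step_physx_sim() applies it.
        alpha = 0.8  # illustrative filter coefficient
        self.torques = alpha * self.torques + (1.0 - alpha) * self._prev_torques
        self._prev_torques = self.torques.clone()
# -----------------------------------------------------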
52 changes: 28 additions & 24 deletions gym/envs/base/fixed_robot_config.py
@@ -123,34 +123,33 @@ class FixedRobotCfgPPO(BaseConfig):
class logging:
enable_local_saving = True

class policy:
class actor:
init_noise_std = 1.0
hidden_dims = [512, 256, 128]
critic_hidden_dims = [512, 256, 128]
# * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid
activation = "elu"
# only for 'ActorCriticRecurrent':
# rnn_type = 'lstm'
# rnn_hidden_size = 512
# rnn_num_layers = 1

obs = [
"observation_a",
"observation_b",
"these_need_to_be_atributes_(states)_of_the_robot_env",
]

critic_obs = [
"observation_x",
"observation_y",
"critic_obs_can_be_the_same_or_different_than_actor_obs",
]
normalize_obs = True

actions = ["tau_ff"]
disable_actions = False

class noise:
noise = 0.1 # implement as needed, also in your robot class
observation_a = 0.1 # implement as needed, also in your robot class

class critic:
hidden_dims = [512, 256, 128]
activation = "elu"
normalize_obs = True
obs = [
"observation_x",
"observation_y",
"critic_obs_can_be_the_same_or_different_than_actor_obs",
]

class rewards:
class weights:
@@ -165,20 +164,25 @@ class termination_weight:
termination = 0.0

class algorithm:
# * training params
value_loss_coef = 1.0
use_clipped_value_loss = True
# both
gamma = 0.99
lam = 0.95
# shared
batch_size = 2**15
max_gradient_steps = 10
# new
storage_size = 2**17 # new
batch_size = 2**15 # new

clip_param = 0.2
entropy_coef = 0.01
num_learning_epochs = 5
# * mini batch size = num_envs*nsteps / nminibatches
num_mini_batches = 4
learning_rate = 1.0e-3
max_grad_norm = 1.0
# Critic
use_clipped_value_loss = True
# Actor
entropy_coef = 0.01
schedule = "adaptive" # could be adaptive, fixed
gamma = 0.99
lam = 0.95
desired_kl = 0.01
max_grad_norm = 1.0

class runner:
policy_class_name = "ActorCritic"
28 changes: 18 additions & 10 deletions gym/envs/base/legged_robot_config.py
@@ -239,6 +239,7 @@ class actor:
# can be elu, relu, selu, crelu, lrelu, tanh, sigmoid
activation = "elu"
normalize_obs = True
smooth_exploration = False

obs = [
"observation_a",
@@ -288,20 +289,27 @@ class termination_weight:
termination = 0.01

class algorithm:
# * training params
value_loss_coef = 1.0
use_clipped_value_loss = True
# both
gamma = 0.99
lam = 0.95
# shared
batch_size = 2**15
max_gradient_steps = 24
# new
storage_size = 2**17 # new
batch_size = 2**15 # new

clip_param = 0.2
entropy_coef = 0.01
num_learning_epochs = 5
# * mini batch size = num_envs*nsteps / nminibatches
num_mini_batches = 4
learning_rate = 1.0e-3
max_grad_norm = 1.0
# Critic
use_clipped_value_loss = True
# Actor
entropy_coef = 0.01
schedule = "adaptive" # could be adaptive, fixed
gamma = 0.99
lam = 0.95
desired_kl = 0.01
max_grad_norm = 1.0
lr_range = [2e-4, 1e-2]
lr_ratio = 1.3

class runner:
policy_class_name = "ActorCritic"
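
The new smooth_exploration flag on the actor (and the exploration_sample_freq setting in the mini-cheetah config further down) corresponds to the gSDE-style smooth noise the commit history mentions ("smooth noise sampling and started gSDE"). The actor implementation itself is not part of this diff, so the sketch below only illustrates the general idea of state-dependent noise that is resampled every few policy steps; the class, shapes, and scaling are assumptions.

# ----- illustrative sketch (not part of this PR) -----
import torch


class SmoothNoiseSketch:
    """gSDE-style noise: an exploration matrix is redrawn only every
    `sample_freq` steps, so the action noise is a smooth function of the
    latent features instead of white per-step Gaussian noise."""

    def __init__(self, latent_dim, action_dim, log_std_init=0.0, sample_freq=16):
        self.log_std = torch.full((latent_dim, action_dim), log_std_init)
        self.sample_freq = sample_freq
        self.step_count = 0
        self.exploration_mat = None

    def _maybe_resample(self):
        if self.exploration_mat is None or self.step_count % self.sample_freq == 0:
            # One epsilon per (latent, action) entry, scaled by the learned std.
            self.exploration_mat = torch.randn_like(self.log_std) * self.log_std.exp()
        self.step_count += 1

    def noisy_action(self, mean_action, latent):
        # latent: (num_envs, latent_dim), mean_action: (num_envs, action_dim)
        self._maybe_resample()
        # Noise is a deterministic function of the latent until the next resample.
        return mean_action + latent @ self.exploration_mat
# -----------------------------------------------------

Resampling only every sample_freq steps keeps the exploration signal correlated in time, which is what makes the resulting joint commands smoother than per-step Gaussian noise.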
3 changes: 2 additions & 1 deletion gym/envs/base/task_skeleton.py
@@ -45,6 +45,7 @@ def reset(self):
"""Reset all robots"""
self._reset_idx(torch.arange(self.num_envs, device=self.device))
self.step()
self.episode_length_buf[:] = 0

def _reset_buffers(self):
self.to_be_reset[:] = False
@@ -67,7 +68,7 @@ def _eval_reward(self, name):
def _check_terminations_and_timeouts(self):
"""Check if environments need to be reset"""
contact_forces = self.contact_forces[:, self.termination_contact_indices, :]
self.terminated = torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1)
self.terminated |= torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1)
self.timed_out = self.episode_length_buf >= self.max_episode_length
self.to_be_reset = self.timed_out | self.terminated

2 changes: 1 addition & 1 deletion gym/envs/cartpole/cartpole_config.py
@@ -67,7 +67,7 @@ class CartpoleRunnerCfg(FixedRobotCfgPPO):
seed = -1
runner_class_name = "OnPolicyRunner"

class policy(FixedRobotCfgPPO.policy):
class actor(FixedRobotCfgPPO.actor):
init_noise_std = 1.0
num_layers = 2
num_units = 32
19 changes: 4 additions & 15 deletions gym/envs/mini_cheetah/mini_cheetah_config.py
@@ -130,6 +130,8 @@ class actor:
hidden_dims = [256, 256, 128]
# * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid
activation = "elu"
smooth_exploration = True
exploration_sample_freq = 16

obs = [
"base_lin_vel",
@@ -190,24 +192,11 @@ class termination_weight:
termination = 0.01

class algorithm(LeggedRobotRunnerCfg.algorithm):
# * training params
value_loss_coef = 1.0
use_clipped_value_loss = True
clip_param = 0.2
entropy_coef = 0.02
num_learning_epochs = 4
# * mini batch size = num_envs*nsteps / nminibatches
num_mini_batches = 8
learning_rate = 1.0e-5
schedule = "adaptive" # can be adaptive or fixed
discount_horizon = 1.0 # [s]
# GAE_bootstrap_horizon = 2.0 # [s]
desired_kl = 0.01
max_grad_norm = 1.0
pass

class runner(LeggedRobotRunnerCfg.runner):
run_name = ""
experiment_name = "mini_cheetah"
max_iterations = 500
max_iterations = 800
algorithm_class_name = "PPO2"
num_steps_per_env = 32