Draft
Changes from all commits
47 commits
ac6cab1
initial commit for off-policy (but it's still PPO2)
sheim Mar 25, 2024
df6a655
pendulum baseline
sheim Mar 25, 2024
0a20324
quick fix on storage config
sheim Mar 26, 2024
6fd4ba9
refactor GAE computation to be explicit
sheim Mar 26, 2024
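
A minimal sketch of what an explicit GAE pass over a rollout looks like (illustrative only; buffer names and shapes are assumptions, not this repository's code):

import torch

def compute_gae(rewards, values, last_values, dones, gamma=0.99, lam=0.95):
    # rewards, values, dones: (num_steps, num_envs); last_values: (num_envs,)
    advantages = torch.zeros_like(rewards)
    gae = torch.zeros_like(last_values)
    for t in reversed(range(rewards.shape[0])):
        next_values = last_values if t == rewards.shape[0] - 1 else values[t + 1]
        not_done = 1.0 - dones[t].float()
        delta = rewards[t] + gamma * next_values * not_done - values[t]
        gae = delta + gamma * lam * not_done * gae
        advantages[t] = gae
    returns = advantages + values
    return advantages, returns
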
890d562
set up off-policy, and SAC value function update.
sheim Mar 26, 2024
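
For context, the standard SAC critic target looks roughly like the sketch below (network handles and hyperparameter names are assumptions, not the code added in this commit):

import torch

def sac_critic_targets(reward, next_obs, done, actor, q1_target, q2_target,
                       gamma=0.99, alpha=0.2):
    # standard SAC bootstrap: min over twin target critics plus an entropy bonus
    with torch.no_grad():
        next_action, next_log_prob = actor.sample(next_obs)
        q_next = torch.min(q1_target(next_obs, next_action),
                           q2_target(next_obs, next_action))
        target = reward + gamma * (1.0 - done) * (q_next - alpha * next_log_prob)
    return target
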
200a147
store next_obs explicitly, use for last_values
sheim Mar 26, 2024
96d5abb
refactor critic loss function to consume obs and self-evaluate
sheim Mar 26, 2024
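
One reading of this refactor: the critic evaluates raw observations itself inside the loss rather than being handed precomputed values (a hedged sketch, not the repository's actual interface):

import torch
import torch.nn.functional as F

def critic_loss(critic, obs, target_values):
    # the critic consumes observations and produces its own value estimates,
    # then regresses them toward the bootstrapped targets
    values = critic(obs)
    return F.mse_loss(values, target_values)
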
bf05612
WIP: should be SAC now, with a lot of rough edges, and missing the ta…
sheim Mar 27, 2024
692502c
zap useless code
sheim Mar 27, 2024
5c7d898
WIP: SAC implementation (based off rsl_rl) runs
sheim Mar 28, 2024
c542214
update fixed_robot to match legged_robot refactor
sheim Mar 28, 2024
a5ca4fd
WIP: separate config for pendulum SAC, comments on missing components
sheim Mar 28, 2024
cf5cecd
WIP: partly fill replay buffer before training, bugfix on generator, …
sheim Apr 1, 2024
82e2179
switch back to elu, remove grad clipping
sheim Apr 1, 2024
7da0fab
match URDF, obs, ctrl freq, etc to gymnasium
sheim Apr 4, 2024
1feafde
uniform sampling during initial fill, proper inference actions for play
sheim Apr 4, 2024
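
A sketch of the usual warm-up pattern: sample uniformly from the action space until the replay buffer is partly filled, use the stochastic policy afterwards, and a deterministic action at play/inference time (all names below are assumed, not this repository's API):

import torch

def select_actions(actor, obs, steps_collected, init_fill_steps,
                   action_low, action_high, inference=False):
    # assumed helper illustrating the sampling schedule
    if inference:
        return actor.act_inference(obs)  # deterministic action for play (assumed method)
    if steps_collected < init_fill_steps:
        # uniform random actions in [action_low, action_high] during the initial fill
        u = torch.rand(obs.shape[0], action_low.shape[-1], device=obs.device)
        return action_low + u * (action_high - action_low)
    return actor.sample(obs)[0]  # stochastic action for training
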
1969e0d
WIP: fix polyak bug, some more tuning
sheim Apr 5, 2024
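
For reference, a polyak (soft) target update typically looks like the sketch below; a common bug is swapping the interpolation weights or updating in the wrong direction (parameter names assumed):

import torch

@torch.no_grad()
def polyak_update(online_net, target_net, tau=0.005):
    # target <- tau * online + (1 - tau) * target
    for p, p_t in zip(online_net.parameters(), target_net.parameters()):
        p_t.mul_(1.0 - tau).add_(tau * p)
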
b69f8c0
normalize advantages only after computing returns
sheim Apr 8, 2024
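
The ordering matters because returns are reconstructed from the unnormalized advantages; a minimal sketch of the corrected order (tensor names assumed):

# compute returns from the raw advantages first ...
returns = advantages + values
# ... and only then normalize the advantages used by the policy loss
advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
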
7db3f14
WIP: some tuning and fixes
sheim Apr 11, 2024
fe0a489
WIP: some more reasonable tuning for single env
sheim Apr 11, 2024
3808741
refactor chimera actor hidden layers
sheim Apr 12, 2024
5636bfc
quick fix of init_fill printout
sheim Apr 12, 2024
cb51752
fix replay buffer
sheim Apr 18, 2024
eda4827
update configs to be closer to SB3
sheim Apr 18, 2024
67f75cc
derp, min instead of max -_- derp
sheim Apr 18, 2024
4eb0966
split environment config for SAC pendulum
sheim Apr 18, 2024
5ee7674
WIP: random tweaks and tuning before switching to other stuff
sheim Apr 22, 2024
e10c836
burn in normalization before learning
sheim Apr 26, 2024
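
A sketch of the burn-in idea: step the simulation for a while so the running observation-normalization statistics settle before any gradient updates are taken (helper and method names below are assumptions):

def burn_in_normalization(env, obs_normalizer, random_actions, num_burn_in_steps):
    # warm up running mean/std of the observations; no learning updates here
    for _ in range(num_burn_in_steps):
        obs = env.get_observations()
        obs_normalizer.update(obs)
        env.step(random_actions(obs))
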
acedd97
fix configs wrt normalization and fixed_base
sheim Apr 26, 2024
4f87735
Merge pull request #17 from mit-biomimetics/runner_burnin
sheim Apr 29, 2024
a79fae1
ppo2 works again on pendulum (lqr/testing config)
lukasmolnar May 31, 2024
3f81326
revert some pendulum changes, SAC runs now but doesn't learn
lukasmolnar May 31, 2024
9278807
sac pendulum converges somewhat (bugfixes + new rewards)
lukasmolnar Jun 5, 2024
a6d4adf
increase LR (all pendulums balance now)
lukasmolnar Jun 5, 2024
02611d0
increase LR and episode len
lukasmolnar Jun 6, 2024
dd9704e
added SAC mini cheetah and refactorings
lukasmolnar Jun 7, 2024
65eb253
PPO and SAC work on same rewards and config
lukasmolnar Jun 12, 2024
2ccfd8f
refactorings
lukasmolnar Jun 12, 2024
433b2f5
Merge branch 'dev' into SAC
sheim Jun 14, 2024
49ddec2
fixes
lukasmolnar Jun 14, 2024
f61f6b5
Merge remote-tracking branch 'origin/SAC' into lm/SAC_dev
lukasmolnar Jun 14, 2024
1721d06
Merge pull request #21 from mit-biomimetics/lm/SAC_dev
sheim Jun 14, 2024
5025875
visualize value function for PPO
lukasmolnar Jun 15, 2024
fa49617
forgot export_to_numpy in dict_utils
lukasmolnar Jun 15, 2024
7b44683
visualize SAC critics, update plotting for both SAC and PPO
lukasmolnar Jun 17, 2024
39a767f
slimmed down runner
sheim Jul 2, 2024
31968b6
address comments
lukasmolnar Jul 3, 2024
14 changes: 12 additions & 2 deletions gym/envs/__init__.py
@@ -19,31 +19,35 @@
"Anymal": ".anymal_c.anymal",
"A1": ".a1.a1",
"HumanoidRunning": ".mit_humanoid.humanoid_running",
"Pendulum": ".pendulum.pendulum",
"Pendulum": ".pendulum.pendulum"
}

config_dict = {
"CartpoleCfg": ".cartpole.cartpole_config",
"MiniCheetahCfg": ".mini_cheetah.mini_cheetah_config",
"MiniCheetahRefCfg": ".mini_cheetah.mini_cheetah_ref_config",
"MiniCheetahOscCfg": ".mini_cheetah.mini_cheetah_osc_config",
"MiniCheetahSACCfg": ".mini_cheetah.mini_cheetah_SAC_config",
"MITHumanoidCfg": ".mit_humanoid.mit_humanoid_config",
"A1Cfg": ".a1.a1_config",
"AnymalCFlatCfg": ".anymal_c.flat.anymal_c_flat_config",
"HumanoidRunningCfg": ".mit_humanoid.humanoid_running_config",
"PendulumCfg": ".pendulum.pendulum_config",
"PendulumSACCfg": ".pendulum.pendulum_SAC_config",
}

runner_config_dict = {
"CartpoleRunnerCfg": ".cartpole.cartpole_config",
"MiniCheetahRunnerCfg": ".mini_cheetah.mini_cheetah_config",
"MiniCheetahRefRunnerCfg": ".mini_cheetah.mini_cheetah_ref_config",
"MiniCheetahOscRunnerCfg": ".mini_cheetah.mini_cheetah_osc_config",
"MiniCheetahSACRunnerCfg": ".mini_cheetah.mini_cheetah_SAC_config",
"MITHumanoidRunnerCfg": ".mit_humanoid.mit_humanoid_config",
"A1RunnerCfg": ".a1.a1_config",
"AnymalCFlatRunnerCfg": ".anymal_c.flat.anymal_c_flat_config",
"HumanoidRunningRunnerCfg": ".mit_humanoid.humanoid_running_config",
"PendulumRunnerCfg": ".pendulum.pendulum_config",
"PendulumSACRunnerCfg": ".pendulum.pendulum_SAC_config",
}

task_dict = {
@@ -59,14 +63,20 @@
"MiniCheetahOscCfg",
"MiniCheetahOscRunnerCfg",
],
"sac_mini_cheetah": [
"MiniCheetahRef",
"MiniCheetahSACCfg",
"MiniCheetahSACRunnerCfg"
],
"humanoid": ["MIT_Humanoid", "MITHumanoidCfg", "MITHumanoidRunnerCfg"],
"humanoid_running": [
"HumanoidRunning",
"HumanoidRunningCfg",
"HumanoidRunningRunnerCfg",
],
"flat_anymal_c": ["Anymal", "AnymalCFlatCfg", "AnymalCFlatRunnerCfg"],
"pendulum": ["Pendulum", "PendulumCfg", "PendulumRunnerCfg"]
"pendulum": ["Pendulum", "PendulumCfg", "PendulumRunnerCfg"],
"sac_pendulum": ["Pendulum", "PendulumSACCfg", "PendulumSACRunnerCfg"],
}

for class_name, class_location in class_dict.items():
97 changes: 39 additions & 58 deletions gym/envs/base/fixed_robot.py
@@ -40,64 +40,60 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless):
self.reset()

def step(self):
"""Apply actions, simulate, call self.post_physics_step()
and pre_physics_step()

Args:
actions (torch.Tensor): Tensor of shape
(num_envs, num_actions_per_env)
"""

self._reset_buffers()
self._pre_physics_step()
# * step physics and render each frame
self._pre_decimation_step()
self._render()
for _ in range(self.cfg.control.decimation):
self._pre_compute_torques()
self.torques = self._compute_torques()

if self.cfg.asset.disable_motors:
self.torques[:] = 0.0
torques_to_gym_tensor = torch.zeros(
self.num_envs, self.num_dof, device=self.device
)

# todo encapsulate
next_torques_idx = 0
for dof_idx in range(self.num_dof):
if self.cfg.control.actuated_joints_mask[dof_idx]:
torques_to_gym_tensor[:, dof_idx] = self.torques[
:, next_torques_idx
]
next_torques_idx += 1
else:
torques_to_gym_tensor[:, dof_idx] = torch.zeros(
self.num_envs, device=self.device
)

self.gym.set_dof_actuation_force_tensor(
self.sim, gymtorch.unwrap_tensor(torques_to_gym_tensor)
)
self.gym.simulate(self.sim)
if self.device == "cpu":
self.gym.fetch_results(self.sim, True)
self.gym.refresh_dof_state_tensor(self.sim)

self._post_physics_step()
self._post_compute_torques()
self._step_physx_sim()
self._post_physx_step()
self._post_decimation_step()
self._check_terminations_and_timeouts()

env_ids = self.to_be_reset.nonzero(as_tuple=False).flatten()
self._reset_idx(env_ids)

def _pre_physics_step(self):
pass
def _pre_decimation_step(self):
return None

def _pre_compute_torques(self):
return None

def _post_compute_torques(self):
if self.cfg.asset.disable_motors:
self.torques[:] = 0.0

def _post_physics_step(self):
def _step_physx_sim(self):
next_torques_idx = 0
torques_to_gym_tensor = torch.zeros(
self.num_envs, self.num_dof, device=self.device
)
for dof_idx in range(self.num_dof):
if self.cfg.control.actuated_joints_mask[dof_idx]:
torques_to_gym_tensor[:, dof_idx] = self.torques[:, next_torques_idx]
next_torques_idx += 1
else:
torques_to_gym_tensor[:, dof_idx] = torch.zeros(
self.num_envs, device=self.device
)
self.gym.set_dof_actuation_force_tensor(
self.sim, gymtorch.unwrap_tensor(self.torques)
)
self.gym.simulate(self.sim)
if self.device == "cpu":
self.gym.fetch_results(self.sim, True)
self.gym.refresh_dof_state_tensor(self.sim)

def _post_physx_step(self):
"""
check terminations, compute observations and rewards
"""
self.gym.refresh_actor_root_state_tensor(self.sim)
self.gym.refresh_net_contact_force_tensor(self.sim)

def _post_decimation_step(self):
self.episode_length_buf += 1
self.common_step_counter += 1

@@ -212,18 +208,6 @@ def _process_rigid_body_props(self, props, env_id):
return props

def _compute_torques(self):
"""Compute torques from actions.
Actions can be interpreted as position or velocity targets given
to a PD controller, or directly as scaled torques.
[NOTE]: torques must have the same dimension as the number of DOFs,
even if some DOFs are not actuated.

Args:
actions (torch.Tensor): Actions

Returns:
[torch.Tensor]: Torques sent to the simulation
"""
actuated_dof_pos = torch.zeros(
self.num_envs, self.num_actuators, device=self.device
)
@@ -415,10 +399,7 @@ def _init_buffers(self):
self.default_act_pos = self.default_act_pos.unsqueeze(0)
# * store indices of actuated joints
self.act_idx = to_torch(actuated_idx, dtype=torch.long, device=self.device)
# * check that init range highs and lows are consistent
# * and repopulate to match
if self.cfg.init_state.reset_mode == "reset_to_range":
self.initialize_ranges_for_initial_conditions()
self.initialize_ranges_for_initial_conditions()

def initialize_ranges_for_initial_conditions(self):
self.dof_pos_range = torch.zeros(
53 changes: 29 additions & 24 deletions gym/envs/base/fixed_robot_config.py
@@ -123,34 +123,33 @@ class FixedRobotCfgPPO(BaseConfig):
class logging:
enable_local_saving = True

class policy:
class actor:
init_noise_std = 1.0
hidden_dims = [512, 256, 128]
critic_hidden_dims = [512, 256, 128]
# * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid
activation = "elu"
# only for 'ActorCriticRecurrent':
# rnn_type = 'lstm'
# rnn_hidden_size = 512
# rnn_num_layers = 1

obs = [
"observation_a",
"observation_b",
"these_need_to_be_atributes_(states)_of_the_robot_env",
]

critic_obs = [
"observation_x",
"observation_y",
"critic_obs_can_be_the_same_or_different_than_actor_obs",
]
normalize_obs = True

actions = ["tau_ff"]
disable_actions = False

class noise:
noise = 0.1 # implement as needed, also in your robot class
observation_a = 0.1 # implement as needed, also in your robot class

class critic:
hidden_dims = [512, 256, 128]
activation = "elu"
normalize_obs = True
obs = [
"observation_x",
"observation_y",
"critic_obs_can_be_the_same_or_different_than_actor_obs",
]

class rewards:
class weights:
Expand All @@ -165,20 +164,25 @@ class termination_weight:
termination = 0.0

class algorithm:
# * training params
value_loss_coef = 1.0
use_clipped_value_loss = True
# both
gamma = 0.99
lam = 0.95
# shared
batch_size = 2**15
max_gradient_steps = 10
# new
storage_size = 2**17 # new
batch_size = 2**15 # new

clip_param = 0.2
entropy_coef = 0.01
num_learning_epochs = 5
# * mini batch size = num_envs*nsteps / nminibatches
num_mini_batches = 4
learning_rate = 1.0e-3
max_grad_norm = 1.0
# Critic
use_clipped_value_loss = True
# Actor
entropy_coef = 0.01
schedule = "adaptive" # could be adaptive, fixed
gamma = 0.99
lam = 0.95
desired_kl = 0.01
max_grad_norm = 1.0

class runner:
policy_class_name = "ActorCritic"
@@ -189,6 +193,7 @@ class runner:
# * logging
# * check for potential saves every this many iterations
save_interval = 50
log_storage = False
run_name = ""
experiment_name = "fixed_robot"

30 changes: 17 additions & 13 deletions gym/envs/base/legged_robot_config.py
@@ -238,13 +238,12 @@ class actor:
hidden_dims = [512, 256, 128]
# can be elu, relu, selu, crelu, lrelu, tanh, sigmoid
activation = "elu"
normalize_obs = True

obs = [
"observation_a",
"observation_b",
"these_need_to_be_atributes_(states)_of_the_robot_env",
]
normalize_obs = True

actions = ["q_des"]
disable_actions = False
@@ -288,25 +287,30 @@ class termination_weight:
termination = 0.01

class algorithm:
# * training params
value_loss_coef = 1.0
use_clipped_value_loss = True
# both
gamma = 0.99
lam = 0.95
# shared
batch_size = 2**15
max_gradient_steps = 10
# new
storage_size = 2**17 # new
batch_size = 2**15 # new

clip_param = 0.2
entropy_coef = 0.01
num_learning_epochs = 5
# * mini batch size = num_envs*nsteps / nminibatches
num_mini_batches = 4
learning_rate = 1.0e-3
max_grad_norm = 1.0
# Critic
use_clipped_value_loss = True
# Actor
entropy_coef = 0.01
schedule = "adaptive" # could be adaptive, fixed
gamma = 0.99
lam = 0.95
desired_kl = 0.01
max_grad_norm = 1.0

class runner:
policy_class_name = "ActorCritic"
algorithm_class_name = "PPO2"
num_steps_per_env = 24
num_steps_per_env = 24 # deprecate
max_iterations = 1500
save_interval = 50
run_name = ""
3 changes: 2 additions & 1 deletion gym/envs/base/task_skeleton.py
@@ -45,6 +45,7 @@ def reset(self):
"""Reset all robots"""
self._reset_idx(torch.arange(self.num_envs, device=self.device))
self.step()
self.episode_length_buf[:] = 0

def _reset_buffers(self):
self.to_be_reset[:] = False
@@ -67,7 +68,7 @@ def _eval_reward(self, name):
def _check_terminations_and_timeouts(self):
"""Check if environments need to be reset"""
contact_forces = self.contact_forces[:, self.termination_contact_indices, :]
self.terminated = torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1)
self.terminated |= torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1)
self.timed_out = self.episode_length_buf >= self.max_episode_length
self.to_be_reset = self.timed_out | self.terminated

2 changes: 1 addition & 1 deletion gym/envs/cartpole/cartpole_config.py
@@ -67,7 +67,7 @@ class CartpoleRunnerCfg(FixedRobotCfgPPO):
seed = -1
runner_class_name = "OnPolicyRunner"

class policy(FixedRobotCfgPPO.policy):
class actor(FixedRobotCfgPPO.actor):
init_noise_std = 1.0
num_layers = 2
num_units = 32